Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch')
-rw-r--r-- meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch | 2655
1 file changed, 2655 insertions, 0 deletions
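The file below is itself a kernel patch: it is carried in the meta-amd-bsp layer and applied to the linux-yocto 4.19.8 sources during the kernel build. For inspection outside the build, it can also be applied to a matching kernel tree with git (the tree path here is illustrative):

  cd /path/to/linux-yocto-4.19.8
  git am 3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch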
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch
new file mode 100644
index 00000000..4821ab4b
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3119-drm-amdkfd-Replace-gfx10-trap-handler-with-correct-b.patch
@@ -0,0 +1,2655 @@
+From 2a3d8424c1f5d22418083a272bff16dc3c79ed05 Mon Sep 17 00:00:00 2001
+From: Jay Cornwall <jay.cornwall@amd.com>
+Date: Wed, 24 Jul 2019 12:23:42 -0500
+Subject: [PATCH 3119/4256] drm/amdkfd: Replace gfx10 trap handler with correct
+ branch
+
+Previously submitted code was taken from an incorrect branch and
+was non-functional.
+
+Cc: Oak Zeng <oak.zeng@amd.com>
+Signed-off-by: Jay Cornwall <jay.cornwall@amd.com>
+Acked-by: Alex Deucher <alexander.deucher@amd.com>
+Reviewed-By: Oak Zeng <oak.zeng@amd.com>
+---
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 553 +++--
+ .../amd/amdkfd/cwsr_trap_handler_gfx10.asm | 1978 ++++++++---------
+ 2 files changed, 1220 insertions(+), 1311 deletions(-)
+
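+The cwsr_trap_handler.h hex array is the assembled form of
+cwsr_trap_handler_gfx10.asm, which is why the two files change in
+lockstep. As an illustrative sketch (sp3 is AMD's shader assembler;
+the exact invocation for this branch may differ), the array can be
+regenerated along the lines of:
+
+  sp3 cwsr_trap_handler_gfx10.asm -hex gfx10.hex
+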
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+index 427594035597..2b3d7017f142 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+@@ -680,24 +680,47 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
+ };
+
+ static const uint32_t cwsr_trap_gfx10_hex[] = {
+- 0xbf820001, 0xbf82012e,
+- 0xb0804004, 0xb970f802,
+- 0x8a708670, 0xb971f803,
+- 0x8771ff71, 0x00000400,
+- 0xbf850008, 0xb971f803,
+- 0x8771ff71, 0x000001ff,
+- 0xbf850001, 0x806c846c,
++ 0xbf820001, 0xbf8201b2,
++ 0xb0804004, 0xb978f802,
++ 0x8a788678, 0xb971f803,
++ 0x876eff71, 0x00000400,
++ 0xbf850033, 0x876eff71,
++ 0x00000100, 0xbf840002,
++ 0x8878ff78, 0x00002000,
++ 0x8a77ff77, 0xff000000,
++ 0xb96ef807, 0x876fff6e,
++ 0x02000000, 0x8f6f866f,
++ 0x88776f77, 0x876fff6e,
++ 0x003f8000, 0x8f6f896f,
++ 0x88776f77, 0x8a6eff6e,
++ 0x023f8000, 0xb9eef807,
++ 0xb970f812, 0xb971f813,
++ 0x8ff08870, 0xf4051bb8,
++ 0xfa000000, 0xbf8cc07f,
++ 0xf4051c38, 0xfa000008,
++ 0xbf8cc07f, 0x87ee6e6e,
++ 0xbf840001, 0xbe80206e,
++ 0xb971f803, 0x8771ff71,
++ 0x000001ff, 0xbf850002,
++ 0x806c846c, 0x826d806d,
++ 0x876dff6d, 0x0000ffff,
++ 0x906e8977, 0x876fff6e,
++ 0x003f8000, 0x906e8677,
++ 0x876eff6e, 0x02000000,
++ 0x886e6f6e, 0xb9eef807,
++ 0x87fe7e7e, 0x87ea6a6a,
++ 0xb9f8f802, 0xbe80226c,
++ 0xb971f803, 0x8771ff71,
++ 0x00000100, 0xbf840006,
++ 0xbef60380, 0xb9f60203,
+ 0x876dff6d, 0x0000ffff,
+- 0xbe80226c, 0xb971f803,
+- 0x8771ff71, 0x00000100,
+- 0xbf840006, 0xbef60380,
+- 0xb9f60203, 0x876dff6d,
+- 0x0000ffff, 0x80ec886c,
+- 0x82ed806d, 0xbef60380,
+- 0xb9f60283, 0xb973f816,
+- 0xb9762c07, 0x8f769c76,
+- 0x886d766d, 0xb97603c7,
+- 0x8f769b76, 0x886d766d,
++ 0x80ec886c, 0x82ed806d,
++ 0xbef60380, 0xb9f60283,
++ 0xb972f816, 0xb9762c07,
++ 0x8f769a76, 0x886d766d,
++ 0xb97603c7, 0x8f769976,
++ 0x886d766d, 0xb9760647,
++ 0x8f769876, 0x886d766d,
+ 0xb976f807, 0x8776ff76,
+ 0x00007fff, 0xb9f6f807,
+ 0xbeee037e, 0xbeef037f,
+@@ -706,32 +729,167 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
+ 0xbef4037e, 0x8775ff7f,
+ 0x0000ffff, 0x8875ff75,
+ 0x00040000, 0xbef60380,
+- 0xbef703ff, 0x00807fac,
++ 0xbef703ff, 0x10807fac,
+ 0x8776ff7f, 0x08000000,
+ 0x90768376, 0x88777677,
+ 0x8776ff7f, 0x70000000,
+ 0x90768176, 0x88777677,
+ 0xbefb037c, 0xbefa0380,
+- 0xb97202dc, 0x8872727f,
+- 0xbefe03c1, 0x877c8172,
++ 0xb97302dc, 0x8f739973,
++ 0x8873737f, 0xb97a2a05,
++ 0x807a817a, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
++ 0xbf850002, 0x8f7a897a,
++ 0xbf820001, 0x8f7a8a7a,
++ 0xb9761e06, 0x8f768a76,
++ 0x807a767a, 0x807aff7a,
++ 0x00000200, 0xbef603ff,
++ 0x01000000, 0xbefe037c,
++ 0xbefc037a, 0xf4611efa,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xbefe037c,
++ 0xbefc037a, 0xf4611b3a,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xbefe037c,
++ 0xbefc037a, 0xf4611b7a,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xbefe037c,
++ 0xbefc037a, 0xf4611bba,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xbefe037c,
++ 0xbefc037a, 0xf4611bfa,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xbefe037c,
++ 0xbefc037a, 0xf4611e3a,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0xb971f803,
++ 0xbefe037c, 0xbefc037a,
++ 0xf4611c7a, 0xf8000000,
++ 0x807a847a, 0xbefc037e,
++ 0xbefe037c, 0xbefc037a,
++ 0xf4611cba, 0xf8000000,
++ 0x807a847a, 0xbefc037e,
++ 0xb97bf801, 0xbefe037c,
++ 0xbefc037a, 0xf4611efa,
++ 0xf8000000, 0x807a847a,
++ 0xbefc037e, 0x8776ff7f,
++ 0x04000000, 0xbeef0380,
++ 0x886f6f76, 0xb97a2a05,
++ 0x807a817a, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
++ 0xbf850002, 0x8f7a897a,
++ 0xbf820001, 0x8f7a8a7a,
++ 0xb9761e06, 0x8f768a76,
++ 0x807a767a, 0xbef603ff,
++ 0x01000000, 0xbef20374,
++ 0x80747a74, 0x82758075,
++ 0xbefc0380, 0xbf800000,
++ 0xbe802f00, 0xbe822f02,
++ 0xbe842f04, 0xbe862f06,
++ 0xbe882f08, 0xbe8a2f0a,
++ 0xbe8c2f0c, 0xbe8e2f0e,
++ 0xf469003a, 0xfa000000,
++ 0xf469013a, 0xfa000010,
++ 0xf469023a, 0xfa000020,
++ 0xf469033a, 0xfa000030,
++ 0x8074c074, 0x82758075,
++ 0x807c907c, 0xbf0aff7c,
++ 0x00000060, 0xbf85ffea,
++ 0xbe802f00, 0xbe822f02,
++ 0xbe842f04, 0xbe862f06,
++ 0xbe882f08, 0xf469003a,
++ 0xfa000000, 0xf469013a,
++ 0xfa000010, 0xf465023a,
++ 0xfa000020, 0x8074c074,
++ 0x82758075, 0xbef40372,
++ 0xbefa0380, 0xbefe03c1,
++ 0x907c9973, 0x877c817c,
+ 0xbf06817c, 0xbf850002,
+- 0xbeff0380, 0xbf820001,
+- 0xbeff03c1, 0xb9712a05,
+- 0x80718171, 0x8f718271,
+- 0x877c8172, 0xbf06817c,
+- 0xbf85000d, 0x8f768771,
++ 0xbeff0380, 0xbf820002,
++ 0xbeff03c1, 0xbf82000b,
+ 0xbef603ff, 0x01000000,
+- 0xbefc0380, 0x7e008700,
+ 0xe0704000, 0x7a5d0000,
+- 0x807c817c, 0x807aff7a,
+- 0x00000080, 0xbf0a717c,
+- 0xbf85fff8, 0xbf82001b,
+- 0x8f768871, 0xbef603ff,
+- 0x01000000, 0xbefc0380,
+- 0x7e008700, 0xe0704000,
+- 0x7a5d0000, 0x807c817c,
+- 0x807aff7a, 0x00000100,
+- 0xbf0a717c, 0xbf85fff8,
++ 0xe0704080, 0x7a5d0100,
++ 0xe0704100, 0x7a5d0200,
++ 0xe0704180, 0x7a5d0300,
++ 0xbf82000a, 0xbef603ff,
++ 0x01000000, 0xe0704000,
++ 0x7a5d0000, 0xe0704100,
++ 0x7a5d0100, 0xe0704200,
++ 0x7a5d0200, 0xe0704300,
++ 0x7a5d0300, 0xbefe03c1,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850002,
++ 0xbeff0380, 0xbf820001,
++ 0xbeff03c1, 0xb9714306,
++ 0x8771c171, 0xbf840046,
++ 0xbf8a0000, 0x8776ff6f,
++ 0x04000000, 0xbf840042,
++ 0x8f718671, 0x8f718271,
++ 0xbef60371, 0xb97a2a05,
++ 0x807a817a, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
++ 0xbf850002, 0x8f7a897a,
++ 0xbf820001, 0x8f7a8a7a,
++ 0xb9761e06, 0x8f768a76,
++ 0x807a767a, 0x807aff7a,
++ 0x00000200, 0x807aff7a,
++ 0x00000080, 0xbef603ff,
++ 0x01000000, 0xd7650000,
++ 0x000100c1, 0xd7660000,
++ 0x000200c1, 0x16000084,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbefc0380,
++ 0xbf850012, 0xbe8303ff,
++ 0x00000080, 0xbf800000,
++ 0xbf800000, 0xbf800000,
++ 0xd8d80000, 0x01000000,
++ 0xbf8c0000, 0xe0704000,
++ 0x7a5d0100, 0x807c037c,
++ 0x807a037a, 0xd5250000,
++ 0x0001ff00, 0x00000080,
++ 0xbf0a717c, 0xbf85fff4,
++ 0xbf820011, 0xbe8303ff,
++ 0x00000100, 0xbf800000,
++ 0xbf800000, 0xbf800000,
++ 0xd8d80000, 0x01000000,
++ 0xbf8c0000, 0xe0704000,
++ 0x7a5d0100, 0x807c037c,
++ 0x807a037a, 0xd5250000,
++ 0x0001ff00, 0x00000100,
++ 0xbf0a717c, 0xbf85fff4,
++ 0xbefe03c1, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
++ 0xbf850004, 0xbefa03ff,
++ 0x00000200, 0xbeff0380,
++ 0xbf820003, 0xbefa03ff,
++ 0x00000400, 0xbeff03c1,
++ 0xb9712a05, 0x80718171,
++ 0x8f718271, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
++ 0xbf850017, 0xbef603ff,
++ 0x01000000, 0xbefc0384,
++ 0xbf0a717c, 0xbf840037,
++ 0x7e008700, 0x7e028701,
++ 0x7e048702, 0x7e068703,
++ 0xe0704000, 0x7a5d0000,
++ 0xe0704080, 0x7a5d0100,
++ 0xe0704100, 0x7a5d0200,
++ 0xe0704180, 0x7a5d0300,
++ 0x807c847c, 0x807aff7a,
++ 0x00000200, 0xbf0a717c,
++ 0xbf85ffef, 0xbf820025,
++ 0xbef603ff, 0x01000000,
++ 0xbefc0384, 0xbf0a717c,
++ 0xbf840020, 0x7e008700,
++ 0x7e028701, 0x7e048702,
++ 0x7e068703, 0xe0704000,
++ 0x7a5d0000, 0xe0704100,
++ 0x7a5d0100, 0xe0704200,
++ 0x7a5d0200, 0xe0704300,
++ 0x7a5d0300, 0x807c847c,
++ 0x807aff7a, 0x00000400,
++ 0xbf0a717c, 0xbf85ffef,
+ 0xb9711e06, 0x8771c171,
+ 0xbf84000c, 0x8f718371,
+ 0x80717c71, 0xbefe03c1,
+@@ -739,133 +897,82 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
+ 0xe0704000, 0x7a5d0000,
+ 0x807c817c, 0x807aff7a,
+ 0x00000080, 0xbf0a717c,
+- 0xbf85fff8, 0xbf8a0000,
+- 0x8776ff72, 0x04000000,
+- 0xbf84002b, 0xbefe03c1,
+- 0x877c8172, 0xbf06817c,
+- 0xbf850002, 0xbeff0380,
+- 0xbf820001, 0xbeff03c1,
+- 0xb9714306, 0x8771c171,
+- 0xbf840021, 0x8f718671,
+- 0x8f718271, 0xbef60371,
+- 0xbef603ff, 0x01000000,
+- 0xd7650000, 0x000100c1,
+- 0xd7660000, 0x000200c1,
+- 0x16000084, 0x877c8172,
+- 0xbf06817c, 0xbefc0380,
+- 0xbf85000a, 0x807cff7c,
+- 0x00000080, 0x807aff7a,
+- 0x00000080, 0xd5250000,
+- 0x0001ff00, 0x00000080,
+- 0xbf0a717c, 0xbf85fff7,
+- 0xbf820009, 0x807cff7c,
+- 0x00000100, 0x807aff7a,
+- 0x00000100, 0xd5250000,
+- 0x0001ff00, 0x00000100,
+- 0xbf0a717c, 0xbf85fff7,
+- 0x877c8172, 0xbf06817c,
+- 0xbf850003, 0x8f7687ff,
+- 0x0000006a, 0xbf820002,
+- 0x8f7688ff, 0x0000006a,
+- 0xbef603ff, 0x01000000,
+- 0x877c8172, 0xbf06817c,
+- 0xbefc0380, 0xbf800000,
+- 0xbf85000b, 0xbe802e00,
+- 0x7e000200, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0x807c817c,
+- 0xbf0aff7c, 0x0000006a,
+- 0xbf85fff6, 0xbf82000a,
+- 0xbe802e00, 0x7e000200,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000100,
+- 0x807c817c, 0xbf0aff7c,
+- 0x0000006a, 0xbf85fff6,
+- 0xbef60384, 0xbef603ff,
+- 0x01000000, 0x877c8172,
+- 0xbf06817c, 0xbf850030,
+- 0x7e00027b, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0x7e00026c,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000080,
+- 0x7e00026d, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0x7e00026e,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000080,
+- 0x7e00026f, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0x7e000270,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000080,
+- 0xb971f803, 0x7e000271,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000080,
+- 0x7e000273, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0xb97bf801,
+- 0x7e00027b, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000080, 0xbf82002f,
+- 0x7e00027b, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000100, 0x7e00026c,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000100,
+- 0x7e00026d, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000100, 0x7e00026e,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000100,
+- 0x7e00026f, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000100, 0x7e000270,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000100,
+- 0xb971f803, 0x7e000271,
+- 0xe0704000, 0x7a5d0000,
+- 0x807aff7a, 0x00000100,
+- 0x7e000273, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000100, 0xb97bf801,
+- 0x7e00027b, 0xe0704000,
+- 0x7a5d0000, 0x807aff7a,
+- 0x00000100, 0xbf820119,
++ 0xbf85fff8, 0xbf820138,
+ 0xbef4037e, 0x8775ff7f,
+ 0x0000ffff, 0x8875ff75,
+ 0x00040000, 0xbef60380,
+- 0xbef703ff, 0x00807fac,
++ 0xbef703ff, 0x10807fac,
+ 0x8772ff7f, 0x08000000,
+ 0x90728372, 0x88777277,
+ 0x8772ff7f, 0x70000000,
+ 0x90728172, 0x88777277,
+- 0xb97902dc, 0x8879797f,
+- 0xbef80380, 0xbefe03c1,
+- 0x877c8179, 0xbf06817c,
++ 0xb97302dc, 0x8f739973,
++ 0x8873737f, 0x8772ff7f,
++ 0x04000000, 0xbf840036,
++ 0xbefe03c1, 0x907c9973,
++ 0x877c817c, 0xbf06817c,
+ 0xbf850002, 0xbeff0380,
+ 0xbf820001, 0xbeff03c1,
+- 0xb96f2a05, 0x806f816f,
+- 0x8f6f826f, 0x877c8179,
+- 0xbf06817c, 0xbf850013,
+- 0x8f76876f, 0xbef603ff,
+- 0x01000000, 0xbef20378,
+- 0x8078ff78, 0x00000080,
+- 0xbefc0381, 0xe0304000,
+- 0x785d0000, 0xbf8c3f70,
+- 0x7e008500, 0x807c817c,
++ 0xb96f4306, 0x876fc16f,
++ 0xbf84002b, 0x8f6f866f,
++ 0x8f6f826f, 0xbef6036f,
++ 0xb9782a05, 0x80788178,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850002,
++ 0x8f788978, 0xbf820001,
++ 0x8f788a78, 0xb9721e06,
++ 0x8f728a72, 0x80787278,
++ 0x8078ff78, 0x00000200,
+ 0x8078ff78, 0x00000080,
+- 0xbf0a6f7c, 0xbf85fff7,
+- 0xe0304000, 0x725d0000,
+- 0xbf820023, 0x8f76886f,
++ 0xbef603ff, 0x01000000,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbefc0380,
++ 0xbf850009, 0xe0310000,
++ 0x781d0000, 0x807cff7c,
++ 0x00000080, 0x8078ff78,
++ 0x00000080, 0xbf0a6f7c,
++ 0xbf85fff8, 0xbf820008,
++ 0xe0310000, 0x781d0000,
++ 0x807cff7c, 0x00000100,
++ 0x8078ff78, 0x00000100,
++ 0xbf0a6f7c, 0xbf85fff8,
++ 0xbef80380, 0xbefe03c1,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850002,
++ 0xbeff0380, 0xbf820001,
++ 0xbeff03c1, 0xb96f2a05,
++ 0x806f816f, 0x8f6f826f,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850021,
+ 0xbef603ff, 0x01000000,
+ 0xbef20378, 0x8078ff78,
+- 0x00000100, 0xbefc0381,
++ 0x00000200, 0xbefc0384,
+ 0xe0304000, 0x785d0000,
++ 0xe0304080, 0x785d0100,
++ 0xe0304100, 0x785d0200,
++ 0xe0304180, 0x785d0300,
+ 0xbf8c3f70, 0x7e008500,
+- 0x807c817c, 0x8078ff78,
+- 0x00000100, 0xbf0a6f7c,
+- 0xbf85fff7, 0xb96f1e06,
++ 0x7e028501, 0x7e048502,
++ 0x7e068503, 0x807c847c,
++ 0x8078ff78, 0x00000200,
++ 0xbf0a6f7c, 0xbf85ffee,
++ 0xe0304000, 0x725d0000,
++ 0xe0304080, 0x725d0100,
++ 0xe0304100, 0x725d0200,
++ 0xe0304180, 0x725d0300,
++ 0xbf820031, 0xbef603ff,
++ 0x01000000, 0xbef20378,
++ 0x8078ff78, 0x00000400,
++ 0xbefc0384, 0xe0304000,
++ 0x785d0000, 0xe0304100,
++ 0x785d0100, 0xe0304200,
++ 0x785d0200, 0xe0304300,
++ 0x785d0300, 0xbf8c3f70,
++ 0x7e008500, 0x7e028501,
++ 0x7e048502, 0x7e068503,
++ 0x807c847c, 0x8078ff78,
++ 0x00000400, 0xbf0a6f7c,
++ 0xbf85ffee, 0xb96f1e06,
+ 0x876fc16f, 0xbf84000e,
+ 0x8f6f836f, 0x806f7c6f,
+ 0xbefe03c1, 0xbeff0380,
+@@ -875,107 +982,81 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
+ 0x00000080, 0xbf0a6f7c,
+ 0xbf85fff7, 0xbeff03c1,
+ 0xe0304000, 0x725d0000,
+- 0x8772ff79, 0x04000000,
+- 0xbf840020, 0xbefe03c1,
+- 0x877c8179, 0xbf06817c,
+- 0xbf850002, 0xbeff0380,
+- 0xbf820001, 0xbeff03c1,
+- 0xb96f4306, 0x876fc16f,
+- 0xbf840016, 0x8f6f866f,
+- 0x8f6f826f, 0xbef6036f,
+- 0xbef603ff, 0x01000000,
+- 0x877c8172, 0xbf06817c,
+- 0xbefc0380, 0xbf850007,
+- 0x807cff7c, 0x00000080,
+- 0x8078ff78, 0x00000080,
+- 0xbf0a6f7c, 0xbf85fffa,
+- 0xbf820006, 0x807cff7c,
+- 0x00000100, 0x8078ff78,
+- 0x00000100, 0xbf0a6f7c,
+- 0xbf85fffa, 0x877c8179,
+- 0xbf06817c, 0xbf850003,
+- 0x8f7687ff, 0x0000006a,
+- 0xbf820002, 0x8f7688ff,
+- 0x0000006a, 0xbef603ff,
+- 0x01000000, 0x877c8179,
+- 0xbf06817c, 0xbf850012,
+- 0xf4211cba, 0xf0000000,
+- 0x8078ff78, 0x00000080,
+- 0xbefc0381, 0xf421003a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xbf8cc07f,
+- 0xbe803000, 0xbf800000,
+- 0x807c817c, 0xbf0aff7c,
+- 0x0000006a, 0xbf85fff5,
+- 0xbe800372, 0xbf820011,
+- 0xf4211cba, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xbefc0381, 0xf421003a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000100, 0xbf8cc07f,
+- 0xbe803000, 0xbf800000,
+- 0x807c817c, 0xbf0aff7c,
+- 0x0000006a, 0xbf85fff5,
+- 0xbe800372, 0xbef60384,
++ 0xe0304080, 0x725d0100,
++ 0xe0304100, 0x725d0200,
++ 0xe0304180, 0x725d0300,
++ 0xb9782a05, 0x80788178,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850002,
++ 0x8f788978, 0xbf820001,
++ 0x8f788a78, 0xb9721e06,
++ 0x8f728a72, 0x80787278,
++ 0x8078ff78, 0x00000200,
++ 0x80f8ff78, 0x00000058,
++ 0x80f88878, 0xbef603ff,
++ 0x01000000, 0xbefc03ff,
++ 0x0000006a, 0xf425003a,
++ 0xf0000000, 0x80f8a078,
++ 0xbf8cc07f, 0x80fc827c,
++ 0xbf800000, 0xbe803100,
++ 0xf42d003a, 0xf0000000,
++ 0x80f8c078, 0xbf8cc07f,
++ 0x80fc887c, 0xbf800000,
++ 0xbe803100, 0xbe823102,
++ 0xbe843104, 0xbe863106,
++ 0xf431003a, 0xf0000000,
++ 0x80f8c078, 0xbf8cc07f,
++ 0x80fc907c, 0xbf800000,
++ 0xbe803100, 0xbe823102,
++ 0xbe843104, 0xbe863106,
++ 0xbe883108, 0xbe8a310a,
++ 0xbe8c310c, 0xbe8e310e,
++ 0xbf06807c, 0xbf84fff0,
++ 0xb9782a05, 0x80788178,
++ 0x907c9973, 0x877c817c,
++ 0xbf06817c, 0xbf850002,
++ 0x8f788978, 0xbf820001,
++ 0x8f788a78, 0xb9721e06,
++ 0x8f728a72, 0x80787278,
++ 0x8078ff78, 0x00000200,
+ 0xbef603ff, 0x01000000,
+- 0x877c8179, 0xbf06817c,
+- 0xbf850025, 0xf4211bfa,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211b3a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211b7a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211eba,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211efa,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211c3a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211c7a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211cfa,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xf4211e7a,
+- 0xf0000000, 0x8078ff78,
+- 0x00000080, 0xbf820024,
+ 0xf4211bfa, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xf4211b3a, 0xf0000000,
+- 0x8078ff78, 0x00000100,
++ 0x80788478, 0xf4211b3a,
++ 0xf0000000, 0x80788478,
+ 0xf4211b7a, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xf4211eba, 0xf0000000,
+- 0x8078ff78, 0x00000100,
++ 0x80788478, 0xf4211eba,
++ 0xf0000000, 0x80788478,
+ 0xf4211efa, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xf4211c3a, 0xf0000000,
+- 0x8078ff78, 0x00000100,
++ 0x80788478, 0xf4211c3a,
++ 0xf0000000, 0x80788478,
+ 0xf4211c7a, 0xf0000000,
+- 0x8078ff78, 0x00000100,
++ 0x80788478, 0xf4211e7a,
++ 0xf0000000, 0x80788478,
+ 0xf4211cfa, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xf4211e7a, 0xf0000000,
+- 0x8078ff78, 0x00000100,
+- 0xbf8cc07f, 0x876dff6d,
++ 0x80788478, 0xbf8cc07f,
++ 0xbef2036d, 0x876dff72,
+ 0x0000ffff, 0xbefc036f,
+ 0xbefe037a, 0xbeff037b,
+ 0x876f71ff, 0x000003ff,
+- 0xb9ef4803, 0xb9f3f816,
++ 0xb9ef4803, 0xb9f9f816,
+ 0x876f71ff, 0xfffff800,
+ 0x906f8b6f, 0xb9efa2c3,
+- 0xb9f9f801, 0x876fff6d,
+- 0xf0000000, 0x906f9c6f,
+- 0x8f6f906f, 0xbef20380,
+- 0x88726f72, 0x876fff6d,
+- 0x08000000, 0x906f9b6f,
+- 0x8f6f8f6f, 0x88726f72,
+- 0x876fff70, 0x00800000,
+- 0x906f976f, 0xb9f2f807,
+- 0xb9f0f802, 0xbf8a0000,
+- 0xbe80226c, 0xbf810000,
++ 0xb9f3f801, 0x876fff72,
++ 0xfc000000, 0x906f9a6f,
++ 0x8f6f906f, 0xbef30380,
++ 0x88736f73, 0x876fff72,
++ 0x02000000, 0x906f996f,
++ 0x8f6f8f6f, 0x88736f73,
++ 0x876fff72, 0x01000000,
++ 0x906f986f, 0x8f6f996f,
++ 0x88736f73, 0x876fff70,
++ 0x00800000, 0x906f976f,
++ 0xb9f3f807, 0x87fe7e7e,
++ 0x87ea6a6a, 0xb9f0f802,
++ 0xbf8a0000, 0xbe80226c,
++ 0xbf810000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+ 0xbf9f0000, 0xbf9f0000,
+- 0xbf9f0000, 0x00000000,
+ };
+ static const uint32_t cwsr_trap_arcturus_hex[] = {
+ 0xbf820001, 0xbf8202c4,
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+index f20e463e748b..261e05430852 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+@@ -20,1105 +20,933 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
++var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
++var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
++var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
++var SQ_WAVE_STATUS_HALT_MASK = 0x2000
++
++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4
++var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
++var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
++var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
++var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
++
++var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
++var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF
++var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
++var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
++var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
++var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
++var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
++
++var SQ_WAVE_IB_STS_RCNT_SHIFT = 16
++var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15
++var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25
++var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE = 1
++var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000
++var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1
++var SQ_WAVE_IB_STS_RCNT_SIZE = 6
++var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000
++var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF
++
++var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
++var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
++
++// bits [31:24] unused by SPI debug data
++var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31
++var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000
++var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24
++var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000
++
++// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
++// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
++var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000
++var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC
++
++var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000
++var S_SAVE_SPI_INIT_ATC_SHIFT = 27
++var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000
++var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
++var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
++var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
++
++var S_SAVE_PC_HI_RCNT_SHIFT = 26
++var S_SAVE_PC_HI_RCNT_MASK = 0xFC000000
++var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 25
++var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x02000000
++var S_SAVE_PC_HI_REPLAY_W64H_SHIFT = 24
++var S_SAVE_PC_HI_REPLAY_W64H_MASK = 0x01000000
++
++var s_sgpr_save_num = 106
++
++var s_save_spi_init_lo = exec_lo
++var s_save_spi_init_hi = exec_hi
++var s_save_pc_lo = ttmp0
++var s_save_pc_hi = ttmp1
++var s_save_exec_lo = ttmp2
++var s_save_exec_hi = ttmp3
++var s_save_status = ttmp12
++var s_save_trapsts = ttmp5
++var s_save_xnack_mask = ttmp6
++var s_wave_size = ttmp7
++var s_save_buf_rsrc0 = ttmp8
++var s_save_buf_rsrc1 = ttmp9
++var s_save_buf_rsrc2 = ttmp10
++var s_save_buf_rsrc3 = ttmp11
++var s_save_mem_offset = ttmp14
++var s_save_alloc_size = s_save_trapsts
++var s_save_tmp = s_save_buf_rsrc2
++var s_save_m0 = ttmp15
++
++var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
++var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
++
++var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000
++var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
++var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000
++var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
++var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000
++var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
++var S_WAVE_SIZE = 25
++
++var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
++var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
++var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
++var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
++
++var s_restore_spi_init_lo = exec_lo
++var s_restore_spi_init_hi = exec_hi
++var s_restore_mem_offset = ttmp12
++var s_restore_alloc_size = ttmp3
++var s_restore_tmp = ttmp6
++var s_restore_mem_offset_save = s_restore_tmp
++var s_restore_m0 = s_restore_alloc_size
++var s_restore_mode = ttmp7
++var s_restore_pc_lo = ttmp0
++var s_restore_pc_hi = ttmp1
++var s_restore_exec_lo = ttmp14
++var s_restore_exec_hi = ttmp15
++var s_restore_status = ttmp4
++var s_restore_trapsts = ttmp5
++var s_restore_xnack_mask = ttmp13
++var s_restore_buf_rsrc0 = ttmp8
++var s_restore_buf_rsrc1 = ttmp9
++var s_restore_buf_rsrc2 = ttmp10
++var s_restore_buf_rsrc3 = ttmp11
++var s_restore_size = ttmp7
+
+ shader main
++ asic(DEFAULT)
++ type(CS)
++ wave_size(32)
+
+-asic(DEFAULT)
+-
+-type(CS)
+-
+-wave_size(32)
+-/*************************************************************************/
+-/* control on how to run the shader */
+-/*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
+-var EMU_RUN_HACK = 0
+-var EMU_RUN_HACK_RESTORE_NORMAL = 0
+-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var SAVE_LDS = 0
+-var WG_BASE_ADDR_LO = 0x9000a000
+-var WG_BASE_ADDR_HI = 0x0
+-var WAVE_SPACE = 0x9000 //memory size that each wave occupies in workgroup state mem, increase from 5000 to 9000 for more SGPR need to be saved
+-var CTX_SAVE_CONTROL = 0x0
+-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
+-var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC)
+-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+-var SAVE_RESTORE_HWID_DDID = 0
+-var RESTORE_DDID_IN_SGPR18 = 0
+-/**************************************************************************/
+-/* variables */
+-/**************************************************************************/
+-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+-
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24
+-var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4
+-var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11
+-var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1
+-
+-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+-
+-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_SIZE = 6 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+-
+-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+-
+-
+-/* Save */
+-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+-
+-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+-
+-var s_save_spi_init_lo = exec_lo
+-var s_save_spi_init_hi = exec_hi
+-
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+-var s_save_pc_hi = ttmp1
+-var s_save_exec_lo = ttmp2
+-var s_save_exec_hi = ttmp3
+-var s_save_status = ttmp4
+-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+-var s_wave_size = ttmp6 //ttmp6 is not needed now, since it's only 32bit xnack mask, now use it to determine wave32 or wave64 in EMU_HACK
+-var s_save_xnack_mask = ttmp7
+-var s_save_buf_rsrc0 = ttmp8
+-var s_save_buf_rsrc1 = ttmp9
+-var s_save_buf_rsrc2 = ttmp10
+-var s_save_buf_rsrc3 = ttmp11
+-
+-var s_save_mem_offset = ttmp14
+-var s_sgpr_save_num = 106 //in gfx10, all sgpr must be saved
+-var s_save_alloc_size = s_save_trapsts //conflict
+-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+-var s_save_m0 = ttmp15
+-
+-/* Restore */
+-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+-
+-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+-
+-var s_restore_spi_init_lo = exec_lo
+-var s_restore_spi_init_hi = exec_hi
+-
+-var s_restore_mem_offset = ttmp12
+-var s_restore_alloc_size = ttmp3
+-var s_restore_tmp = ttmp6
+-var s_restore_mem_offset_save = s_restore_tmp //no conflict
+-
+-var s_restore_m0 = s_restore_alloc_size //no conflict
+-
+-var s_restore_mode = ttmp13
+-var s_restore_hwid1 = ttmp2
+-var s_restore_ddid = s_restore_hwid1
+-var s_restore_pc_lo = ttmp0
+-var s_restore_pc_hi = ttmp1
+-var s_restore_exec_lo = ttmp14
+-var s_restore_exec_hi = ttmp15
+-var s_restore_status = ttmp4
+-var s_restore_trapsts = ttmp5
+-//var s_restore_xnack_mask_lo = xnack_mask_lo
+-//var s_restore_xnack_mask_hi = xnack_mask_hi
+-var s_restore_xnack_mask = ttmp7
+-var s_restore_buf_rsrc0 = ttmp8
+-var s_restore_buf_rsrc1 = ttmp9
+-var s_restore_buf_rsrc2 = ttmp10
+-var s_restore_buf_rsrc3 = ttmp11
+-var s_restore_size = ttmp13 //ttmp13 has no conflict
+-
+-/**************************************************************************/
+-/* trap handler entry points */
+-/**************************************************************************/
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+- else
+- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+- end
++ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+
+ L_JUMP_TO_RESTORE:
+- s_branch L_RESTORE //restore
++ s_branch L_RESTORE
+
+ L_SKIP_RESTORE:
+-
+- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+- s_cbranch_scc1 L_SAVE //this is the operation for save
+-
+- // ********* Handle non-CWSR traps *******************
+- if (!EMU_RUN_HACK)
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+-
+- L_EXCP_CASE:
+- s_and_b32 ttmp1, ttmp1, 0xFFFF
+- s_rfe_b64 [ttmp0, ttmp1]
+- end
+- // ********* End handling of non-CWSR traps *******************
+-
+-/**************************************************************************/
+-/* save routine */
+-/**************************************************************************/
+-
+-L_SAVE:
+-
++ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
++ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
++ s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
++ s_cbranch_scc1 L_SAVE
++
++ // If STATUS.MEM_VIOL is asserted then halt the wave to prevent
++ // the exception from being raised again and blocking context save.
++ s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
++ s_cbranch_scc0 L_FETCH_2ND_TRAP
++ s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
++
++L_FETCH_2ND_TRAP:
++ // Preserve and clear scalar XNACK state before issuing scalar loads.
++ // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
++ // unused space ttmp11[31:24].
++ s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
++ s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
++ s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
++ s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
++ s_or_b32 ttmp11, ttmp11, ttmp3
++ s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
++ s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
++ s_or_b32 ttmp11, ttmp11, ttmp3
++ s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
++ s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
++
++ // Read second-level TBA/TMA from first-level TMA and jump if available.
++ // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
++ // ttmp12 holds SQ_WAVE_STATUS
++ s_getreg_b32 ttmp4, hwreg(HW_REG_SHADER_TMA_LO)
++ s_getreg_b32 ttmp5, hwreg(HW_REG_SHADER_TMA_HI)
++ s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
++ s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
++ s_waitcnt lgkmcnt(0)
++ s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
++ s_waitcnt lgkmcnt(0)
++ s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
++ s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler has not been set
++ s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
++
++L_NO_NEXT_TRAP:
++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
++ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
++ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
++ s_addc_u32 ttmp1, ttmp1, 0
++L_EXCP_CASE:
++ s_and_b32 ttmp1, ttmp1, 0xFFFF
++
++ // Restore SQ_WAVE_IB_STS.
++ s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
++ s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
++ s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
++ s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
++ s_or_b32 ttmp2, ttmp2, ttmp3
++ s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
++
++ // Restore SQ_WAVE_STATUS.
++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
++ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status
++
++ s_rfe_b64 [ttmp0, ttmp1]
++
++L_SAVE:
+ //check whether there is mem_viol
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+ s_cbranch_scc0 L_NO_PC_REWIND
+-
++
+ //if so, need rewind PC assuming GDS operation gets NACKed
+- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
++ s_mov_b32 s_save_tmp, 0
++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
++ s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
++ s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0
+
+ L_NO_PC_REWIND:
+- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+-
+- //s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+- //s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi
+- s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
++ s_mov_b32 s_save_tmp, 0
++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
++
++ s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
++ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+-
+- /* inform SPI the readiness and wait for SPI's go signal */
+- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+- s_mov_b32 s_save_exec_hi, exec_hi
+- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+- if (EMU_RUN_HACK)
+-
+- else
+- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+- end
+-
+- L_SLEEP:
+- s_sleep 0x2
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_cbranch_execz L_SLEEP
+- end
+-
+-
+- /* setup Resource Contants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_save_tmp, v9, 0
+- //determine it is wave32 or wave64
+- s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+- s_cmp_eq_u32 s_wave_size, 0
+- s_cbranch_scc1 L_SAVE_WAVE32
+- s_lshr_b32 s_save_tmp, s_save_tmp, 6 //SAVE WAVE64
+- s_branch L_SAVE_CON
+- L_SAVE_WAVE32:
+- s_lshr_b32 s_save_tmp, s_save_tmp, 5 //SAVE WAVE32
+- L_SAVE_CON:
+- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+-
+-
+- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+-
+- s_mov_b32 s_save_m0, m0 //save M0
+-
+- /* global mem offset */
+- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+- s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //get wave_save_size
+- s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi
+-
+- /* save VGPRs */
+- //////////////////////////////
+- L_SAVE_VGPR:
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
+- s_mov_b32 exec_hi, 0x00000000
+- s_branch L_SAVE_VGPR_NORMAL
+- L_ENABLE_SAVE_VGPR_EXEC_HI:
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+- L_SAVE_VGPR_NORMAL:
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- //for wave32 and wave64, the num of vgpr function is the same?
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+- //determine it is wave32 or wave64
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_SAVE_VGPR_WAVE64
+-
+- //zhenxu added it for save vgpr for wave32
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, 0x0 //VGPR initial index value =0
+- //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+- //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
+-
+- L_SAVE_VGPR_WAVE32_LOOP:
+- v_movrels_b32 v0, v0 //v0 = v[0+m0]
+-
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- end
+-
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 128 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_VGPR_WAVE32_LOOP //VGPR save is complete?
+- s_branch L_SAVE_LDS
+- //save vgpr for wave32 ends
+-
+- L_SAVE_VGPR_WAVE64:
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, 0x0 //VGPR initial index value =0
+- //s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+- //s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later, doesn't need this in gfx10
+-
+- L_SAVE_VGPR_WAVE64_LOOP:
+- v_movrels_b32 v0, v0 //v0 = v[0+m0]
+-
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- end
+-
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_VGPR_WAVE64_LOOP //VGPR save is complete?
+- //s_set_gpr_idx_off
+- //
+- //Below part will be the save shared vgpr part (new for gfx10)
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
+- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+- s_cbranch_scc0 L_SAVE_LDS //no shared_vgpr used? jump to L_SAVE_LDS
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+- //save shared_vgpr will start from the index of m0
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
+- s_mov_b32 exec_lo, 0xFFFFFFFF
+- s_mov_b32 exec_hi, 0x00000000
+- L_SAVE_SHARED_VGPR_WAVE64_LOOP:
+- v_movrels_b32 v0, v0 //v0 = v[0+m0]
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //every buffer_store_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
+-
+- /* save LDS */
+- //////////////////////////////
+- L_SAVE_LDS:
+-
+- //Only check the first wave need LDS
+- /* the first wave in the threadgroup */
+- s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG"
+- s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_cbranch_scc0 L_SAVE_SGPR
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
+- s_mov_b32 exec_hi, 0x00000000
+- s_branch L_SAVE_LDS_NORMAL
+- L_ENABLE_SAVE_LDS_EXEC_HI:
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+- L_SAVE_LDS_NORMAL:
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_SAVE_SGPR //no lds used? jump to L_SAVE_VGPR
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- //load 0~63*4(byte address) to vgpr v15
+- v_mbcnt_lo_u32_b32 v0, -1, 0
+- v_mbcnt_hi_u32_b32 v0, -1, v0
+- v_mul_u32_u24 v0, 4, v0
+-
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_mov_b32 m0, 0x0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP_W64
+-
+- L_SAVE_LDS_LOOP_W32:
+- if (SAVE_LDS)
+- ds_read_b32 v1, v0
+- s_waitcnt 0 //ensure data ready
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10
+- end
+- s_add_u32 m0, m0, 128 //every buffer_store_lds does 128 bytes
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 //mem offset increased by 128 bytes
+- v_add_nc_u32 v0, v0, 128
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
+- s_branch L_SAVE_SGPR
+-
+- L_SAVE_LDS_LOOP_W64:
+- if (SAVE_LDS)
+- ds_read_b32 v1, v0
+- s_waitcnt 0 //ensure data ready
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- //buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 //save lds to memory doesn't exist in 10
+- end
+- s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes
+- v_add_nc_u32 v0, v0, 256
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
+-
+-
+- /* save SGPRs */
+- //////////////////////////////
+- //s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- //s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+- //s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //In gfx10, Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
+- L_SAVE_SGPR:
+- //need to look at it is wave32 or wave64
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_SAVE_SGPR_VMEM_WAVE64
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
+- end
+- s_branch L_SAVE_SGPR_CONT
+- L_SAVE_SGPR_VMEM_WAVE64:
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_save_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
+- end
+- L_SAVE_SGPR_CONT:
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- //s_mov_b32 m0, 0x0 //SGPR initial index value =0
+- //s_nop 0x0 //Manually inserted wait states
+-
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+-
+- s_mov_b32 m0, 0x0 //SGPR initial index value =0
+- s_nop 0x0 //Manually inserted wait states
+-
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64
+-
+- L_SAVE_SGPR_LOOP_WAVE32:
+- s_movrels_b32 s0, s0 //s0 = s[0+m0]
+- //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change
+- write_sgpr_to_mem_wave32(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
+- s_add_u32 m0, m0, 1 //next sgpr index
+- s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE32 //SGPR save is complete?
+- s_branch L_SAVE_HWREG
+-
+- L_SAVE_SGPR_LOOP_WAVE64:
+- s_movrels_b32 s0, s0 //s0 = s[0+m0]
+- //zhenxu, adding one more argument to save sgpr function, this is only for vmem, using sqc is not change
+- write_sgpr_to_mem_wave64(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4
+- s_add_u32 m0, m0, 1 //next sgpr index
+- s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_sgpr_save_num) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP_WAVE64 //SGPR save is complete?
+-
+-
+- /* save HW registers */
+- //////////////////////////////
+- L_SAVE_HWREG:
+- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_SAVE_HWREG_WAVE64
+-
+- write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
+-
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- end
+-
+- write_sgpr_to_mem_wave32(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
+- write_sgpr_to_mem_wave32(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- write_sgpr_to_mem_wave32(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
+- write_sgpr_to_mem_wave32(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- write_sgpr_to_mem_wave32(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
+-
+- //s_save_trapsts conflicts with s_save_alloc_size
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- write_sgpr_to_mem_wave32(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
+-
+- //write_sgpr_to_mem_wave32(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
+- write_sgpr_to_mem_wave32(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
+-
+- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+- write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- if(SAVE_RESTORE_HWID_DDID)
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
+- write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- end
+- s_branch L_S_PGM_END_SAVED
+-
+- L_SAVE_HWREG_WAVE64:
+- write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0
+-
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- end
+-
+- write_sgpr_to_mem_wave64(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC
+- write_sgpr_to_mem_wave64(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- write_sgpr_to_mem_wave64(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC
+- write_sgpr_to_mem_wave64(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+- write_sgpr_to_mem_wave64(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS
+-
+- //s_save_trapsts conflicts with s_save_alloc_size
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- write_sgpr_to_mem_wave64(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS
+-
+- //write_sgpr_to_mem_wave64(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO
+- write_sgpr_to_mem_wave64(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI
+-
+- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+- write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+-
+-
+- if(SAVE_RESTORE_HWID_DDID)
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_HW_ID1) //HW_ID1, handler records the SE/SA/WGP/SIMD/wave of the original wave
+- write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+-
+- /* save DDID */
+- //////////////////////////////
+- L_SAVE_DDID:
+- //EXEC has been saved, no vector inst following
+- s_mov_b32 exec_lo, 0x80000000 //Set MSB to 1. Cleared when draw index is returned
+- s_sendmsg sendmsg(MSG_GET_DDID)
+-
+- L_WAIT_DDID_LOOP:
+- s_nop 7 // sleep a bit
+- s_bitcmp0_b32 exec_lo, 31 // test to see if MSB is cleared, meaning done
+- s_cbranch_scc0 L_WAIT_DDID_LOOP
+-
+- s_mov_b32 s_save_m0, exec_lo
+-
+-
+- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_SAVE_DDID_WAVE64
+-
+- write_sgpr_to_mem_wave32(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+-
+- L_SAVE_DDID_WAVE64:
+- write_sgpr_to_mem_wave64(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF)
+-
+- end
+-
+- L_S_PGM_END_SAVED:
+- /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- s_rfe_b64 s_save_pc_lo //Return to the main shader program
+- else
+- end
+-
+-
+- s_branch L_END_PGM
+-
+-
+-
+-/**************************************************************************/
+-/* restore routine */
+-/**************************************************************************/
++
++	/* inform SPI of our readiness and wait for SPI's go signal */
++ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
++ s_mov_b32 s_save_exec_hi, exec_hi
++ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
++
++ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
++
++L_SLEEP:
++	// sleep 1 (64clk) is not enough for 8 waves per SIMD: the 7th/8th wave
++	// cannot win arbitration to execute an instruction while the other waves
++	// sit in this loop waiting for the SPI write to EXEC, which hangs the SQ
++ s_sleep 0x2
++ s_cbranch_execz L_SLEEP
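++	// handshake: MSG_SAVEWAVE asks SPI to start the save; SPI signals go by
++	// writing a non-zero value into EXEC, which terminates the execz loop above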
++
++	/* setup Resource Constants */
++ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
++ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
++	s_mov_b32	s_save_buf_rsrc2, 0				//NUM_RECORDS initial value = 0 (in bytes), although not necessarily initialized
++ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
++
++ s_mov_b32 s_save_m0, m0
++
++ /* global mem offset */
++ s_mov_b32 s_save_mem_offset, 0x0
++ s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
++ s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE
++	s_or_b32	s_wave_size, s_save_spi_init_hi, s_wave_size	//s_wave_size shares a register with the saved exec_hi; the wave-size flag sits at bit 25
++
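++	// from here on, the wave size is re-derived where needed as
++	// (s_wave_size >> S_WAVE_SIZE) & 1: 0 = wave32, 1 = wave64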
++ /* save HW registers */
++
++L_SAVE_HWREG:
++ // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
++ get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
++ get_svgpr_size_bytes(s_save_tmp)
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
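++	// save-area layout, low to high: VGPRs | shared VGPRs | SGPRs | HWREGs | LDS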
++
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
++
++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
++ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
++ write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
++
++ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE)
++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
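++	// nine dwords in this order: M0, PC_LO/HI, EXEC_LO/HI, STATUS, TRAPSTS,
++	// XNACK_MASK, MODE; L_RESTORE_HWREG reads them back in the same order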
++
++ /* the first wave in the threadgroup */
++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
++ s_mov_b32 s_save_exec_hi, 0x0
++ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
++
++ /* save SGPRs */
++	// Save SGPRs before the LDS save so that s0 to s4 can be used during the LDS save...
++
++ // SGPR SR memory offset : size(VGPR)+size(SVGPR)
++ get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
++ get_svgpr_size_bytes(s_save_tmp)
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++	// back up s_save_buf_rsrc0 to s_save_xnack_mask, since the write_16sgpr_to_mem function advances rsrc0
++ s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0
++ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
++ s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
++
++ s_mov_b32 m0, 0x0 //SGPR initial index value =0
++ s_nop 0x0 //Manually inserted wait states
++L_SAVE_SGPR_LOOP:
++ // SGPR is allocated in 16 SGPR granularity
++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
++ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
++ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
++ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
++
++ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
++ s_add_u32 m0, m0, 16 //next sgpr index
++ s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete?
++
++	//save the remaining 10 SGPRs
++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
++ write_10sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
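++	// 96 + 10 = 106 SGPRs saved in total; s106~s127 are not saved, which is
++	// why L_RESTORE_SGPR backs the offset up by 22*4 bytes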
++
++ // restore s_save_buf_rsrc0,1
++ s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask
++
++	/* save the first 4 VGPRs, then the LDS save can use them */
++	// each wave allocates at least 4 VGPRs...
++
++ s_mov_b32 s_save_mem_offset, 0
++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI
++ s_mov_b32 exec_hi, 0x00000000
++ s_branch L_SAVE_4VGPR_WAVE32
++L_ENABLE_SAVE_4VGPR_EXEC_HI:
++ s_mov_b32 exec_hi, 0xFFFFFFFF
++ s_branch L_SAVE_4VGPR_WAVE64
++L_SAVE_4VGPR_WAVE32:
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR Allocated in 4-GPR granularity
++
++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
++ s_branch L_SAVE_LDS
++
++L_SAVE_4VGPR_WAVE64:
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR Allocated in 4-GPR granularity
++
++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
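++	// per-lane dword stores are interleaved: the offset step is wave size * 4
++	// bytes, i.e. 128 in the wave32 block above vs 256 here in wave64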
++
++ /* save LDS */
++
++L_SAVE_LDS:
++ // Change EXEC to all threads...
++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI
++ s_mov_b32 exec_hi, 0x00000000
++ s_branch L_SAVE_LDS_NORMAL
++L_ENABLE_SAVE_LDS_EXEC_HI:
++ s_mov_b32 exec_hi, 0xFFFFFFFF
++L_SAVE_LDS_NORMAL:
++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
++	s_cbranch_scc0	L_SAVE_LDS_DONE					//no lds used? jump to L_SAVE_LDS_DONE
++
++ s_barrier //LDS is used? wait for other waves in the same TG
++ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
++ s_cbranch_scc0 L_SAVE_LDS_DONE
++
++	// first wave does LDS save;
++
++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
++ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
++
++ // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
++ //
++ get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
++ get_svgpr_size_bytes(s_save_tmp)
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
++
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++	//v0 = lane_id * 4 (per-lane byte address)
++ v_mbcnt_lo_u32_b32 v0, -1, 0
++ v_mbcnt_hi_u32_b32 v0, -1, v0
++ v_mul_u32_u24 v0, 4, v0
++
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_mov_b32 m0, 0x0
++ s_cbranch_scc1 L_SAVE_LDS_W64
++
++L_SAVE_LDS_W32:
++ s_mov_b32 s3, 128
++ s_nop 0
++ s_nop 0
++ s_nop 0
++L_SAVE_LDS_LOOP_W32:
++ ds_read_b32 v1, v0
++ s_waitcnt 0
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++
++	s_add_u32	m0, m0, s3					//every iteration copies s3 = 128 bytes in wave32
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
++ v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes
++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete?
++
++ s_branch L_SAVE_LDS_DONE
++
++L_SAVE_LDS_W64:
++ s_mov_b32 s3, 256
++ s_nop 0
++ s_nop 0
++ s_nop 0
++L_SAVE_LDS_LOOP_W64:
++ ds_read_b32 v1, v0
++ s_waitcnt 0
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++
++	s_add_u32	m0, m0, s3					//every iteration copies s3 = 256 bytes in wave64
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3
++ v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes
++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete?
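++	// both loops copy one dword per lane per iteration, so m0 (bytes copied)
++	// and s_save_mem_offset advance together until s_save_alloc_size is reached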
++
++L_SAVE_LDS_DONE:
++	/* save VGPRs - save the rest of the VGPRs */
++L_SAVE_VGPR:
++ // VGPR SR memory offset: 0
++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI
++	s_mov_b32	s_save_mem_offset, (0+128*4)			// start past the 4 VGPRs already saved (wave32)
++ s_mov_b32 exec_hi, 0x00000000
++ s_branch L_SAVE_VGPR_NORMAL
++L_ENABLE_SAVE_VGPR_EXEC_HI:
++	s_mov_b32	s_save_mem_offset, (0+256*4)			// start past the 4 VGPRs already saved (wave64)
++ s_mov_b32 exec_hi, 0xFFFFFFFF
++L_SAVE_VGPR_NORMAL:
++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
++	//determine whether it is wave32 or wave64
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_SAVE_VGPR_WAVE64
++
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR Allocated in 4-GPR granularity
++
++ // VGPR store using dw burst
++ s_mov_b32 m0, 0x4 //VGPR initial index value =4
++ s_cmp_lt_u32 m0, s_save_alloc_size
++ s_cbranch_scc0 L_SAVE_VGPR_END
++
++L_SAVE_VGPR_W32_LOOP:
++ v_movrels_b32 v0, v0 //v0 = v[0+m0]
++ v_movrels_b32 v1, v1 //v1 = v[1+m0]
++ v_movrels_b32 v2, v2 //v2 = v[2+m0]
++ v_movrels_b32 v3, v3 //v3 = v[3+m0]
++
++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
++
++ s_add_u32 m0, m0, 4 //next vgpr index
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes
++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete?
++
++ s_branch L_SAVE_VGPR_END
++
++L_SAVE_VGPR_WAVE64:
++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR store using dw burst
++ s_mov_b32 m0, 0x4 //VGPR initial index value =4
++ s_cmp_lt_u32 m0, s_save_alloc_size
++ s_cbranch_scc0 L_SAVE_VGPR_END
++
++L_SAVE_VGPR_W64_LOOP:
++ v_movrels_b32 v0, v0 //v0 = v[0+m0]
++ v_movrels_b32 v1, v1 //v1 = v[1+m0]
++ v_movrels_b32 v2, v2 //v2 = v[2+m0]
++ v_movrels_b32 v3, v3 //v3 = v[3+m0]
++
++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
++
++ s_add_u32 m0, m0, 4 //next vgpr index
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete?
++
++ //Below part will be the save shared vgpr part (new for gfx10)
++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
++	s_cbranch_scc0	L_SAVE_VGPR_END					//no shared_vgpr used? jump to L_SAVE_VGPR_END
++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
++	//m0 now holds the normal VGPR count; add the shared_vgpr count to it to get the total count.
++	//saving shared_vgprs will start from index m0
++ s_add_u32 s_save_alloc_size, s_save_alloc_size, m0
++ s_mov_b32 exec_lo, 0xFFFFFFFF
++ s_mov_b32 exec_hi, 0x00000000
++L_SAVE_SHARED_VGPR_WAVE64_LOOP:
++ v_movrels_b32 v0, v0 //v0 = v[0+m0]
++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
++ s_add_u32 m0, m0, 1 //next vgpr index
++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 128
++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete?
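++	// shared VGPRs exist only in the wave64 path; with exec_hi cleared above
++	// only 32 lanes store, hence the 128-byte stride per iteration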
++
++L_SAVE_VGPR_END:
++ s_branch L_END_PGM
+
+ L_RESTORE:
+- /* Setup Resource Contants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_restore_tmp, v9, 0
+- //determine it is wave32 or wave64
+- s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) //change to ttmp13
+- s_cmp_eq_u32 s_restore_size, 0
+- s_cbranch_scc1 L_RESTORE_WAVE32
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 //SAVE WAVE64
+- s_branch L_RESTORE_CON
+- L_RESTORE_WAVE32:
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, 5 //SAVE WAVE32
+- L_RESTORE_CON:
+- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+- else
+- end
+-
+- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+- //determine it is wave32 or wave64
+- s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+- s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size //share s_wave_size with exec_hi
+-
+- /* global mem offset */
+- s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+-
+- /* restore VGPRs */
+- //////////////////////////////
+- L_RESTORE_VGPR:
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
+- s_mov_b32 exec_hi, 0x00000000
+- s_branch L_RESTORE_VGPR_NORMAL
+- L_ENABLE_RESTORE_VGPR_EXEC_HI:
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+- L_RESTORE_VGPR_NORMAL:
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+- //determine it is wave32 or wave64
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
+-
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 7 //NUM_RECORDS in bytes (32 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
+- s_mov_b32 m0, 1 //VGPR initial index value = 1
+- //s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+- //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later, might not need this in gfx10
+-
+- L_RESTORE_VGPR_WAVE32_LOOP:
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- end
+- s_waitcnt vmcnt(0) //ensure data ready
+- v_movreld_b32 v0, v0 //v[0+m0] = v0
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 128 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
+- //s_set_gpr_idx_off
+- /* VGPR restore on v0 */
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+- end
+-
+- s_branch L_RESTORE_LDS
+-
+- L_RESTORE_VGPR_WAVE64:
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256
+- s_mov_b32 m0, 1 //VGPR initial index value = 1
+- L_RESTORE_VGPR_WAVE64_LOOP:
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- end
+- s_waitcnt vmcnt(0) //ensure data ready
+- v_movreld_b32 v0, v0 //v[0+m0] = v0
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
+- //s_set_gpr_idx_off
+- //
+- //Below part will be the restore shared vgpr part (new for gfx10)
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
+- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
+- s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? jump to L_SAVE_LDS
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
+- //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+- //restore shared_vgpr will start from the index of m0
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
+- s_mov_b32 exec_lo, 0xFFFFFFFF
+- s_mov_b32 exec_hi, 0x00000000
+- L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- s_waitcnt vmcnt(0) //ensure data ready
+- v_movreld_b32 v0, v0 //v[0+m0] = v0
+- s_add_u32 m0, m0, 1 //next vgpr index
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //every buffer_load_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
+-
+- s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
+-
+- /* VGPR restore on v0 */
+- L_RESTORE_V0:
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+- end
+-
+-
+- /* restore LDS */
+- //////////////////////////////
+- L_RESTORE_LDS:
+-
+- //Only need to check the first wave
+- /* the first wave in the threadgroup */
+- s_and_b32 s_restore_tmp, s_restore_size, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+- s_cbranch_scc0 L_RESTORE_SGPR
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
+- s_mov_b32 exec_hi, 0x00000000
+- s_branch L_RESTORE_LDS_NORMAL
+- L_ENABLE_RESTORE_LDS_EXEC_HI:
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+- L_RESTORE_LDS_NORMAL:
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_RESTORE_SGPR //no lds used? jump to L_RESTORE_VGPR
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_and_b32 m0, s_wave_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_mov_b32 m0, 0x0
+- s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
+-
+- L_RESTORE_LDS_LOOP_W32:
+- if (SAVE_LDS)
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
+- s_waitcnt 0
+- end
+- s_add_u32 m0, m0, 128 //every buffer_load_dword does 256 bytes
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
+- s_branch L_RESTORE_SGPR
+-
+- L_RESTORE_LDS_LOOP_W64:
+- if (SAVE_LDS)
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1
+- s_waitcnt 0
+- end
+- s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?
+-
+-
+- /* restore SGPRs */
+- //////////////////////////////
+- //s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- //s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+- //s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SGPRs = (sgpr_size + 1) * 8 (non-zero value)
+- L_RESTORE_SGPR:
+- //need to look at it is wave32 or wave64
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_RESTORE_SGPR_VMEM_WAVE64
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 7 //NUM_RECORDS in bytes (32 threads)
+- end
+- s_branch L_RESTORE_SGPR_CONT
+- L_RESTORE_SGPR_VMEM_WAVE64:
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_restore_buf_rsrc2, s_sgpr_save_num, 8 //NUM_RECORDS in bytes (64 threads)
+- end
+-
+- L_RESTORE_SGPR_CONT:
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_RESTORE_SGPR_WAVE64
+-
+- read_sgpr_from_mem_wave32(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
+- s_mov_b32 m0, 0x1
+-
+- L_RESTORE_SGPR_LOOP_WAVE32:
+- read_sgpr_from_mem_wave32(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
+- s_waitcnt lgkmcnt(0) //ensure data ready
+- s_movreld_b32 s0, s0 //s[0+m0] = s0
+- s_nop 0 // hazard SALU M0=> S_MOVREL
+- s_add_u32 m0, m0, 1 //next sgpr index
+- s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE32 //SGPR restore (except s0) is complete?
+- s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
+- s_branch L_RESTORE_HWREG
+-
+- L_RESTORE_SGPR_WAVE64:
+- read_sgpr_from_mem_wave64(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp
+- s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1
+-
+- L_RESTORE_SGPR_LOOP_WAVE64:
+- read_sgpr_from_mem_wave64(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made
+- s_waitcnt lgkmcnt(0) //ensure data ready
+- s_movreld_b32 s0, s0 //s[0+m0] = s0
+- s_nop 0 // hazard SALU M0=> S_MOVREL
+- s_add_u32 m0, m0, 1 //next sgpr index
+- s_cmp_lt_u32 m0, s_sgpr_save_num //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_SGPR_LOOP_WAVE64 //SGPR restore (except s0) is complete?
+- s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */
+-
+-
+- /* restore HW registers */
+- //////////////////////////////
+- L_RESTORE_HWREG:
+- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_RESTORE_HWREG_WAVE64
+-
+- read_sgpr_from_mem_wave32(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
+- read_sgpr_from_mem_wave32(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
+- read_sgpr_from_mem_wave32(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+- read_sgpr_from_mem_wave32(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
+- read_sgpr_from_mem_wave32(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+- read_sgpr_from_mem_wave32(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
+- read_sgpr_from_mem_wave32(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
+- //read_sgpr_from_mem_wave32(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
+- //read_sgpr_from_mem_wave32(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
+- read_sgpr_from_mem_wave32(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
+- read_sgpr_from_mem_wave32(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
+- if(SAVE_RESTORE_HWID_DDID)
+- read_sgpr_from_mem_wave32(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
+- end
+- s_branch L_RESTORE_HWREG_FINISH
+-
+- L_RESTORE_HWREG_WAVE64:
+- read_sgpr_from_mem_wave64(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0
+- read_sgpr_from_mem_wave64(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC
+- read_sgpr_from_mem_wave64(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+- read_sgpr_from_mem_wave64(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC
+- read_sgpr_from_mem_wave64(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+- read_sgpr_from_mem_wave64(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS
+- read_sgpr_from_mem_wave64(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS
+- //read_sgpr_from_mem_wave64(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO
+- //read_sgpr_from_mem_wave64(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI
+- read_sgpr_from_mem_wave64(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK
+- read_sgpr_from_mem_wave64(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE
+- if(SAVE_RESTORE_HWID_DDID)
+- read_sgpr_from_mem_wave64(s_restore_hwid1, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //HW_ID1
+- end
+- L_RESTORE_HWREG_FINISH:
+- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+-
+-
+-
+- if(SAVE_RESTORE_HWID_DDID)
+- L_RESTORE_DDID:
+- s_mov_b32 m0, s_restore_hwid1 //virture ttrace support: The save-context handler records the SE/SA/WGP/SIMD/wave of the original wave
+- s_ttracedata //and then can output it as SHADER_DATA to ttrace on restore to provide a correlation across the save-restore
+-
+- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_and_b32 m0, s_restore_size, 1
+- s_cmp_eq_u32 m0, 1
+- s_cbranch_scc1 L_RESTORE_DDID_WAVE64
+-
+- read_sgpr_from_mem_wave32(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+- s_branch L_RESTORE_DDID_FINISH
+- L_RESTORE_DDID_WAVE64:
+- read_sgpr_from_mem_wave64(s_restore_ddid, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC)
+-
+- L_RESTORE_DDID_FINISH:
+- s_waitcnt lgkmcnt(0)
+- //s_mov_b32 m0, s_restore_ddid
+- //s_ttracedata
+- if (RESTORE_DDID_IN_SGPR18)
+- s_mov_b32 s18, s_restore_ddid
+- end
+-
+- end
+-
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+-
+- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+-
+- s_mov_b32 m0, s_restore_m0
+- s_mov_b32 exec_lo, s_restore_exec_lo
+- s_mov_b32 exec_hi, s_restore_exec_hi
+-
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
++	/* Setup Resource Constants */
++ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
++ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
++ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
++ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
++	//determine whether it is wave32 or wave64
++ s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
++ s_lshl_b32 s_restore_size, s_restore_size, S_WAVE_SIZE
++ s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size
++
++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
++ s_cbranch_scc0 L_RESTORE_VGPR
++
++ /* restore LDS */
++L_RESTORE_LDS:
++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
++ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI
++ s_mov_b32 exec_hi, 0x00000000
++ s_branch L_RESTORE_LDS_NORMAL
++L_ENABLE_RESTORE_LDS_EXEC_HI:
++ s_mov_b32 exec_hi, 0xFFFFFFFF
++L_RESTORE_LDS_NORMAL:
++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
++ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
++ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
++
++ // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
++ //
++ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
++ get_svgpr_size_bytes(s_restore_tmp)
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
++
++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_mov_b32 m0, 0x0
++ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64
++
++L_RESTORE_LDS_LOOP_W32:
++	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// one dword per lane, direct to LDS
++	s_add_u32	m0, m0, 128						// 128 bytes per iteration in wave32
++	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128 bytes
++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete?
++ s_branch L_RESTORE_VGPR
++
++L_RESTORE_LDS_LOOP_W64:
++	buffer_load_dword       v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// one dword per lane, direct to LDS
++	s_add_u32	m0, m0, 256						// 256 bytes per iteration in wave64
++	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256 bytes
++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete?
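++	// lds:1 routes each loaded dword straight into LDS (m0 supplies the LDS
++	// write address), so these loops need no VGPR staging or s_waitcnt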
++
++ /* restore VGPRs */
++L_RESTORE_VGPR:
++ // VGPR SR memory offset : 0
++ s_mov_b32 s_restore_mem_offset, 0x0
++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
++ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI
++ s_mov_b32 exec_hi, 0x00000000
++ s_branch L_RESTORE_VGPR_NORMAL
++L_ENABLE_RESTORE_VGPR_EXEC_HI:
++ s_mov_b32 exec_hi, 0xFFFFFFFF
++L_RESTORE_VGPR_NORMAL:
++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
++	//determine whether it is wave32 or wave64
++ s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64
++
++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR load using dw burst
++	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4; v0~v3 will be the last
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4
++ s_mov_b32 m0, 4 //VGPR initial index value = 4
++
++L_RESTORE_VGPR_WAVE32_LOOP:
++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
++ s_waitcnt vmcnt(0)
++ v_movreld_b32 v0, v0 //v[0+m0] = v0
++ v_movreld_b32 v1, v1
++ v_movreld_b32 v2, v2
++ v_movreld_b32 v3, v3
++ s_add_u32 m0, m0, 4 //next vgpr index
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes
++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete?
++
++ /* VGPR restore on v0 */
++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
++
++ s_branch L_RESTORE_SGPR
++
++L_RESTORE_VGPR_WAVE64:
++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ // VGPR load using dw burst
++	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore starts with v4; v0~v3 will be the last
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
++ s_mov_b32 m0, 4 //VGPR initial index value = 4
++
++L_RESTORE_VGPR_WAVE64_LOOP:
++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
++ s_waitcnt vmcnt(0)
++ v_movreld_b32 v0, v0 //v[0+m0] = v0
++ v_movreld_b32 v1, v1
++ v_movreld_b32 v2, v2
++ v_movreld_b32 v3, v3
++ s_add_u32 m0, m0, 4 //next vgpr index
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
++
++ //Below part will be the restore shared vgpr part (new for gfx10)
++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size
++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero?
++ s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used?
++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value)
++	//m0 now holds the normal VGPR count; add the shared_vgpr count to it to get the total count.
++	//restoring shared_vgprs will start from index m0
++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0
++ s_mov_b32 exec_lo, 0xFFFFFFFF
++ s_mov_b32 exec_hi, 0x00000000
++L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
++ s_waitcnt vmcnt(0)
++ v_movreld_b32 v0, v0 //v[0+m0] = v0
++ s_add_u32 m0, m0, 1 //next vgpr index
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128
++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
++ s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete?
++
++ s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!!
++
++ /* VGPR restore on v0 */
++L_RESTORE_V0:
++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
++
++ /* restore SGPRs */
++	//restore order: 2 + 8 + 16*6 = 106 SGPRs, from S[n] down to S[0]
++ // SGPR SR memory offset : size(VGPR)+size(SVGPR)
++L_RESTORE_SGPR:
++ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
++ get_svgpr_size_bytes(s_restore_tmp)
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
++	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 22*4	//s106~s127 are not saved
++	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 2*4		// restore SGPRs from S[n] down to S[0], starting with a group of 2
++
++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ s_mov_b32 m0, s_sgpr_save_num
++
++ read_2sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
++
++ s_waitcnt lgkmcnt(0)
++
++ s_sub_u32 m0, m0, 2 // Restore from S[n] to S[0]
++ s_nop 0 // hazard SALU M0=> S_MOVREL
++
++ s_movreld_b64 s0, s0 //s[0+m0] = s0
++
++ read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
++ s_waitcnt lgkmcnt(0)
++
++ s_sub_u32 m0, m0, 8 // Restore from S[n] to S[0]
++ s_nop 0 // hazard SALU M0=> S_MOVREL
++
++ s_movreld_b64 s0, s0 //s[0+m0] = s0
++ s_movreld_b64 s2, s2
++ s_movreld_b64 s4, s4
++ s_movreld_b64 s6, s6
++
++L_RESTORE_SGPR_LOOP:
++ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
++ s_waitcnt lgkmcnt(0)
++
++ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
++ s_nop 0 // hazard SALU M0=> S_MOVREL
++
++ s_movreld_b64 s0, s0 //s[0+m0] = s0
++ s_movreld_b64 s2, s2
++ s_movreld_b64 s4, s4
++ s_movreld_b64 s6, s6
++ s_movreld_b64 s8, s8
++ s_movreld_b64 s10, s10
++ s_movreld_b64 s12, s12
++ s_movreld_b64 s14, s14
++
++	s_cmp_eq_u32	m0, 0						//scc = (m0 == 0) ? 1 : 0
++ s_cbranch_scc0 L_RESTORE_SGPR_LOOP
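++	// m0 counts down 106 -> 0 reading the save image backwards: groups of 2 and
++	// 8 first, bringing m0 to 96, then six groups of 16 in the loop above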
++
++ /* restore HW registers */
++L_RESTORE_HWREG:
++ // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
++ get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
++ get_svgpr_size_bytes(s_restore_tmp)
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
++
++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
++
++ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
++ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
++
++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
++
++ s_mov_b32 s_restore_tmp, s_restore_pc_hi
++ s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
++
++ s_mov_b32 m0, s_restore_m0
++ s_mov_b32 exec_lo, s_restore_exec_lo
++ s_mov_b32 exec_hi, s_restore_exec_hi
++
++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+- s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask //restore xnack_mask
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
++ s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+- //reuse s_restore_m0 as a temp register
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status
+-
+- s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
+-
+-
+-// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+- s_rfe_b64 s_restore_pc_lo // s_restore_m0[0] is used to set STATUS.inst_atc
+-
+-
+-/**************************************************************************/
+-/* the END */
+-/**************************************************************************/
+-L_END_PGM:
++ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
++ s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
++ s_mov_b32 s_restore_mode, 0x0
++ s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0
++ s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
++ s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0
++ s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
++ s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0
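++	// RCNT, FIRST_REPLAY and REPLAY_W64H were stashed in the upper bits of the
++	// saved PC_HI; unpack them back into IB_STS, reusing s_restore_mode as scratch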
++
++ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode
++
++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
++ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
++	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status	// SCC is included; it was changed by the preceding SALU instructions
++
++	s_barrier						//barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG
++
++ s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
++
++L_END_PGM:
+ s_endpgm
+-
+-end
+-
+-
+-/**************************************************************************/
+-/* the helper functions */
+-/**************************************************************************/
+-function write_sgpr_to_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
+- if (use_sqc)
+- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+- s_mov_b32 m0, s_mem_offset
+- s_buffer_store_dword s, s_rsrc, m0 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- s_mov_b32 m0, exec_lo
+- elsif (use_mtbuf)
+- v_mov_b32 v0, s
+- tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 128
+- else
+- v_mov_b32 v0, s
+- buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 128
+- end
+ end
+
+-function write_sgpr_to_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf)
+- if (use_sqc)
+- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+- s_mov_b32 m0, s_mem_offset
+- s_buffer_store_dword s, s_rsrc, m0 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- s_mov_b32 m0, exec_lo
+- elsif (use_mtbuf)
+- v_mov_b32 v0, s
+- tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 256
+- else
+- v_mov_b32 v0, s
+- buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 256
+- end
++function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
++ s_mov_b32 exec_lo, m0
++ s_mov_b32 m0, s_mem_offset
++ s_buffer_store_dword s, s_rsrc, m0 glc:1
++ s_add_u32 s_mem_offset, s_mem_offset, 4
++ s_mov_b32 m0, exec_lo
++end
++
++
++function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
++ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
++ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
++end
++
++function write_10sgpr_to_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
++ s_buffer_store_dwordx2 s[8], s_rsrc, 32 glc:1
++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0
++end
++
++
++function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
++ s_add_u32 s_mem_offset, s_mem_offset, 4
+ end
+
+-function read_sgpr_from_mem_wave32(s, s_rsrc, s_mem_offset, use_sqc)
+- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+- if (use_sqc)
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- else
+- s_add_u32 s_mem_offset, s_mem_offset, 128
+- end
++function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+ end
+
+-function read_sgpr_from_mem_wave64(s, s_rsrc, s_mem_offset, use_sqc)
+- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+- if (use_sqc)
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- else
+- s_add_u32 s_mem_offset, s_mem_offset, 256
+- end
++function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1
++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+ end
+
++function read_2sgpr_from_mem(s, s_rsrc, s_mem_offset)
++ s_buffer_load_dwordx2 s, s_rsrc, s_mem_offset glc:1
++ s_sub_u32 s_mem_offset, s_mem_offset, 4*8
++end
++
++
++function get_lds_size_bytes(s_lds_size_byte)
++ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
++	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8	//LDS size in bytes = lds_size * 64 DW * 4 bytes (allocation granularity 64 DW)
++end
++
++function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
++ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
++ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
++ s_lshr_b32 m0, s_size, S_WAVE_SIZE
++ s_and_b32 m0, m0, 1
++ s_cmp_eq_u32 m0, 1
++ s_cbranch_scc1 L_ENABLE_SHIFT_W64
++	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)	//VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 32 lanes * 4 bytes (non-zero value)
++ s_branch L_SHIFT_DONE
++L_ENABLE_SHIFT_W64:
++	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)	//VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 lanes * 4 bytes (non-zero value)
++L_SHIFT_DONE:
++end
++
++function get_svgpr_size_bytes(s_svgpr_size_byte)
++ s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
++ s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
++end
++
++function get_sgpr_size_bytes
++ return 512
++end
++
++function get_hwreg_size_bytes
++ return 128
++end
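++// note: 512 = 128 SGPR slots * 4 bytes (106 actually written);
++// 128 = 32 HWREG dwords * 4 bytes (9 actually written)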
+--
+2.17.1
+