diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2775-drm-amdkfd-Use-SQC-when-TCP-would-fail-in-gfx9-conte.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2775-drm-amdkfd-Use-SQC-when-TCP-would-fail-in-gfx9-conte.patch | 572 |
1 files changed, 572 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2775-drm-amdkfd-Use-SQC-when-TCP-would-fail-in-gfx9-conte.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2775-drm-amdkfd-Use-SQC-when-TCP-would-fail-in-gfx9-conte.patch new file mode 100644 index 00000000..fbab67c8 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2775-drm-amdkfd-Use-SQC-when-TCP-would-fail-in-gfx9-conte.patch @@ -0,0 +1,572 @@ +From eb5266a857727d99e0fe697caf40bc1a3147813a Mon Sep 17 00:00:00 2001 +From: Jay Cornwall <Jay.Cornwall@amd.com> +Date: Mon, 29 Apr 2019 14:08:29 -0500 +Subject: [PATCH 2775/2940] drm/amdkfd: Use SQC when TCP would fail in gfx9 + context save. + +When a wavefront raises TRAPSTS.XNACK_ERROR with STATUS.ALLOW_REPLAY=0 +subsequent memory instructions have undefined behavior. In practice +SQC stores continue to work but TCP stores do not. + +Context save is permitted to fail after XNACK error because the +wavefront will be halted and subsequently terminated. However the +debugger has an interest in retrieving the wavefront VGPR/LDS state. + +Detect the out-of-spec case and use SQC stores during context save +in place of TCP stores. + +Change-Id: I98050e06282874197fcebda51480a2e931deb40c +Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com> +--- + .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 365 ++++++++++++------ + .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 81 ++++ + 2 files changed, 320 insertions(+), 126 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +index e413d4a71fa3..b0b982cd3f0d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +@@ -274,7 +274,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { + + + static const uint32_t cwsr_trap_gfx9_hex[] = { +- 0xbf820001, 0xbf82015e, ++ 0xbf820001, 0xbf820241, + 0xb8f8f802, 0x89788678, + 0xb8fbf803, 0x866eff7b, + 0x00000400, 0xbf85003b, +@@ -404,15 +404,57 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbefe00c1, 0xbeff00c1, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, ++ 0x867aff78, 0x00400000, ++ 0xbf850003, 0xb8faf803, ++ 0x897a7aff, 0x10000000, ++ 0xbf85004d, 0xbe840080, ++ 0xd2890000, 0x00000900, ++ 0x80048104, 0xd2890001, ++ 0x00000900, 0x80048104, ++ 0xd2890002, 0x00000900, ++ 0x80048104, 0xd2890003, ++ 0x00000900, 0x80048104, ++ 0xc069003a, 0x00000070, ++ 0xbf8cc07f, 0x80709070, ++ 0xbf06c004, 0xbf84ffee, ++ 0xbe840080, 0xd2890000, ++ 0x00000901, 0x80048104, ++ 0xd2890001, 0x00000901, ++ 0x80048104, 0xd2890002, ++ 0x00000901, 0x80048104, ++ 0xd2890003, 0x00000901, ++ 0x80048104, 0xc069003a, ++ 0x00000070, 0xbf8cc07f, ++ 0x80709070, 0xbf06c004, ++ 0xbf84ffee, 0xbe840080, ++ 0xd2890000, 0x00000902, ++ 0x80048104, 0xd2890001, ++ 0x00000902, 0x80048104, ++ 0xd2890002, 0x00000902, ++ 0x80048104, 0xd2890003, ++ 0x00000902, 0x80048104, ++ 0xc069003a, 0x00000070, ++ 0xbf8cc07f, 0x80709070, ++ 0xbf06c004, 0xbf84ffee, ++ 0xbe840080, 0xd2890000, ++ 0x00000903, 0x80048104, ++ 0xd2890001, 0x00000903, ++ 0x80048104, 0xd2890002, ++ 0x00000903, 0x80048104, ++ 0xd2890003, 0x00000903, ++ 0x80048104, 0xc069003a, ++ 0x00000070, 0xbf8cc07f, ++ 0x80709070, 0xbf06c004, ++ 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb4306, 0x867bc17b, +- 0xbf84002c, 0xbf8a0000, ++ 0xbf840063, 0xbf8a0000, + 0x867aff6f, 0x04000000, +- 0xbf840028, 0x8e7b867b, ++ 0xbf84005f, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fa1605, +@@ -422,142 +464,213 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { + 0x01000000, 0xbefc0080, + 0xd28c0002, 0x000100c1, + 0xd28d0003, 0x000204c1, +- 0xd1060002, 0x00011103, +- 0x7e0602ff, 0x00000200, +- 0xbefc00ff, 0x00010000, +- 0xbe800077, 0x8677ff77, +- 0xff7fffff, 0x8777ff77, +- 0x00058000, 0xd8ec0000, +- 0x00000002, 0xbf8cc07f, +- 0xe0765000, 0x701d0002, +- 0x68040702, 0xd0c9006a, +- 0x0000f702, 0xbf87fff7, +- 0xbef70000, 0xbef000ff, +- 0x00000400, 0xbefe00c1, +- 0xbeff00c1, 0xb8fb2a05, +- 0x807b817b, 0x8e7b827b, +- 0x8e76887b, 0xbef600ff, +- 0x01000000, 0xbefc0084, +- 0xbf0a7b7c, 0xbf840015, +- 0xbf11017c, 0x807bff7b, +- 0x00001000, 0x7e000300, +- 0x7e020301, 0x7e040302, +- 0x7e060303, 0xe0724000, +- 0x701d0000, 0xe0724100, +- 0x701d0100, 0xe0724200, +- 0x701d0200, 0xe0724300, +- 0x701d0300, 0x807c847c, +- 0x8070ff70, 0x00000400, +- 0xbf0a7b7c, 0xbf85ffef, +- 0xbf9c0000, 0xbf8200da, +- 0xbef4007e, 0x8675ff7f, +- 0x0000ffff, 0x8775ff75, +- 0x00040000, 0xbef60080, +- 0xbef700ff, 0x00807fac, +- 0x866eff7f, 0x08000000, +- 0x8f6e836e, 0x87776e77, +- 0x866eff7f, 0x70000000, +- 0x8f6e816e, 0x87776e77, +- 0x866eff7f, 0x04000000, +- 0xbf84001e, 0xbefe00c1, +- 0xbeff00c1, 0xb8ef4306, +- 0x866fc16f, 0xbf840019, +- 0x8e6f866f, 0x8e6f826f, +- 0xbef6006f, 0xb8f82a05, +- 0x80788178, 0x8e788a78, +- 0xb8ee1605, 0x806e816e, +- 0x8e6e866e, 0x80786e78, +- 0x8078ff78, 0x00000080, +- 0xbef600ff, 0x01000000, +- 0xbefc0080, 0xe0510000, +- 0x781d0000, 0xe0510100, +- 0x781d0000, 0x807cff7c, +- 0x00000200, 0x8078ff78, +- 0x00000200, 0xbf0a6f7c, +- 0xbf85fff6, 0xbef80080, ++ 0x867aff78, 0x00400000, ++ 0xbf850003, 0xb8faf803, ++ 0x897a7aff, 0x10000000, ++ 0xbf850030, 0x24040682, ++ 0xd86e4000, 0x00000002, ++ 0xbf8cc07f, 0xbe840080, ++ 0xd2890000, 0x00000900, ++ 0x80048104, 0xd2890001, ++ 0x00000900, 0x80048104, ++ 0xd2890002, 0x00000900, ++ 0x80048104, 0xd2890003, ++ 0x00000900, 0x80048104, ++ 0xc069003a, 0x00000070, ++ 0xbf8cc07f, 0x80709070, ++ 0xbf06c004, 0xbf84ffee, ++ 0xbe840080, 0xd2890000, ++ 0x00000901, 0x80048104, ++ 0xd2890001, 0x00000901, ++ 0x80048104, 0xd2890002, ++ 0x00000901, 0x80048104, ++ 0xd2890003, 0x00000901, ++ 0x80048104, 0xc069003a, ++ 0x00000070, 0xbf8cc07f, ++ 0x80709070, 0xbf06c004, ++ 0xbf84ffee, 0x680404ff, ++ 0x00000200, 0xd0c9006a, ++ 0x0000f702, 0xbf87ffd2, ++ 0xbf820015, 0xd1060002, ++ 0x00011103, 0x7e0602ff, ++ 0x00000200, 0xbefc00ff, ++ 0x00010000, 0xbe800077, ++ 0x8677ff77, 0xff7fffff, ++ 0x8777ff77, 0x00058000, ++ 0xd8ec0000, 0x00000002, ++ 0xbf8cc07f, 0xe0765000, ++ 0x701d0002, 0x68040702, ++ 0xd0c9006a, 0x0000f702, ++ 0xbf87fff7, 0xbef70000, ++ 0xbef000ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, +- 0xb8ef2a05, 0x806f816f, +- 0x8e6f826f, 0x8e76886f, ++ 0xb8fb2a05, 0x807b817b, ++ 0x8e7b827b, 0x8e76887b, + 0xbef600ff, 0x01000000, +- 0xbeee0078, 0x8078ff78, +- 0x00000400, 0xbefc0084, +- 0xbf11087c, 0x806fff6f, +- 0x00008000, 0xe0524000, +- 0x781d0000, 0xe0524100, +- 0x781d0100, 0xe0524200, +- 0x781d0200, 0xe0524300, +- 0x781d0300, 0xbf8c0f70, ++ 0xbefc0084, 0xbf0a7b7c, ++ 0xbf84006d, 0xbf11017c, ++ 0x807bff7b, 0x00001000, ++ 0x867aff78, 0x00400000, ++ 0xbf850003, 0xb8faf803, ++ 0x897a7aff, 0x10000000, ++ 0xbf850051, 0xbe840080, ++ 0xd2890000, 0x00000900, ++ 0x80048104, 0xd2890001, ++ 0x00000900, 0x80048104, ++ 0xd2890002, 0x00000900, ++ 0x80048104, 0xd2890003, ++ 0x00000900, 0x80048104, ++ 0xc069003a, 0x00000070, ++ 0xbf8cc07f, 0x80709070, ++ 0xbf06c004, 0xbf84ffee, ++ 0xbe840080, 0xd2890000, ++ 0x00000901, 0x80048104, ++ 0xd2890001, 0x00000901, ++ 0x80048104, 0xd2890002, ++ 0x00000901, 0x80048104, ++ 0xd2890003, 0x00000901, ++ 0x80048104, 0xc069003a, ++ 0x00000070, 0xbf8cc07f, ++ 0x80709070, 0xbf06c004, ++ 0xbf84ffee, 0xbe840080, ++ 0xd2890000, 0x00000902, ++ 0x80048104, 0xd2890001, ++ 0x00000902, 0x80048104, ++ 0xd2890002, 0x00000902, ++ 0x80048104, 0xd2890003, ++ 0x00000902, 0x80048104, ++ 0xc069003a, 0x00000070, ++ 0xbf8cc07f, 0x80709070, ++ 0xbf06c004, 0xbf84ffee, ++ 0xbe840080, 0xd2890000, ++ 0x00000903, 0x80048104, ++ 0xd2890001, 0x00000903, ++ 0x80048104, 0xd2890002, ++ 0x00000903, 0x80048104, ++ 0xd2890003, 0x00000903, ++ 0x80048104, 0xc069003a, ++ 0x00000070, 0xbf8cc07f, ++ 0x80709070, 0xbf06c004, ++ 0xbf84ffee, 0x807c847c, ++ 0xbf0a7b7c, 0xbf85ffb1, ++ 0xbf9c0000, 0xbf820012, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, +- 0x807c847c, 0x8078ff78, +- 0x00000400, 0xbf0a6f7c, +- 0xbf85ffee, 0xbf9c0000, +- 0xe0524000, 0x6e1d0000, +- 0xe0524100, 0x6e1d0100, +- 0xe0524200, 0x6e1d0200, +- 0xe0524300, 0x6e1d0300, ++ 0xe0724000, 0x701d0000, ++ 0xe0724100, 0x701d0100, ++ 0xe0724200, 0x701d0200, ++ 0xe0724300, 0x701d0300, ++ 0x807c847c, 0x8070ff70, ++ 0x00000400, 0xbf0a7b7c, ++ 0xbf85ffef, 0xbf9c0000, ++ 0xbf8200da, 0xbef4007e, ++ 0x8675ff7f, 0x0000ffff, ++ 0x8775ff75, 0x00040000, ++ 0xbef60080, 0xbef700ff, ++ 0x00807fac, 0x866eff7f, ++ 0x08000000, 0x8f6e836e, ++ 0x87776e77, 0x866eff7f, ++ 0x70000000, 0x8f6e816e, ++ 0x87776e77, 0x866eff7f, ++ 0x04000000, 0xbf84001e, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8ef4306, 0x866fc16f, ++ 0xbf840019, 0x8e6f866f, ++ 0x8e6f826f, 0xbef6006f, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, +- 0x80786e78, 0x80f8c078, +- 0xb8ef1605, 0x806f816f, +- 0x8e6f846f, 0x8e76826f, +- 0xbef600ff, 0x01000000, +- 0xbefc006f, 0xc031003a, +- 0x00000078, 0x80f8c078, +- 0xbf8cc07f, 0x80fc907c, +- 0xbf800000, 0xbe802d00, +- 0xbe822d02, 0xbe842d04, +- 0xbe862d06, 0xbe882d08, +- 0xbe8a2d0a, 0xbe8c2d0c, +- 0xbe8e2d0e, 0xbf06807c, +- 0xbf84fff0, 0xb8f82a05, ++ 0x80786e78, 0x8078ff78, ++ 0x00000080, 0xbef600ff, ++ 0x01000000, 0xbefc0080, ++ 0xe0510000, 0x781d0000, ++ 0xe0510100, 0x781d0000, ++ 0x807cff7c, 0x00000200, ++ 0x8078ff78, 0x00000200, ++ 0xbf0a6f7c, 0xbf85fff6, ++ 0xbef80080, 0xbefe00c1, ++ 0xbeff00c1, 0xb8ef2a05, ++ 0x806f816f, 0x8e6f826f, ++ 0x8e76886f, 0xbef600ff, ++ 0x01000000, 0xbeee0078, ++ 0x8078ff78, 0x00000400, ++ 0xbefc0084, 0xbf11087c, ++ 0x806fff6f, 0x00008000, ++ 0xe0524000, 0x781d0000, ++ 0xe0524100, 0x781d0100, ++ 0xe0524200, 0x781d0200, ++ 0xe0524300, 0x781d0300, ++ 0xbf8c0f70, 0x7e000300, ++ 0x7e020301, 0x7e040302, ++ 0x7e060303, 0x807c847c, ++ 0x8078ff78, 0x00000400, ++ 0xbf0a6f7c, 0xbf85ffee, ++ 0xbf9c0000, 0xe0524000, ++ 0x6e1d0000, 0xe0524100, ++ 0x6e1d0100, 0xe0524200, ++ 0x6e1d0200, 0xe0524300, ++ 0x6e1d0300, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, +- 0xbef60084, 0xbef600ff, +- 0x01000000, 0xc0211bfa, ++ 0x80f8c078, 0xb8ef1605, ++ 0x806f816f, 0x8e6f846f, ++ 0x8e76826f, 0xbef600ff, ++ 0x01000000, 0xbefc006f, ++ 0xc031003a, 0x00000078, ++ 0x80f8c078, 0xbf8cc07f, ++ 0x80fc907c, 0xbf800000, ++ 0xbe802d00, 0xbe822d02, ++ 0xbe842d04, 0xbe862d06, ++ 0xbe882d08, 0xbe8a2d0a, ++ 0xbe8c2d0c, 0xbe8e2d0e, ++ 0xbf06807c, 0xbf84fff0, ++ 0xb8f82a05, 0x80788178, ++ 0x8e788a78, 0xb8ee1605, ++ 0x806e816e, 0x8e6e866e, ++ 0x80786e78, 0xbef60084, ++ 0xbef600ff, 0x01000000, ++ 0xc0211bfa, 0x00000078, ++ 0x80788478, 0xc0211b3a, + 0x00000078, 0x80788478, +- 0xc0211b3a, 0x00000078, +- 0x80788478, 0xc0211b7a, ++ 0xc0211b7a, 0x00000078, ++ 0x80788478, 0xc0211c3a, + 0x00000078, 0x80788478, +- 0xc0211c3a, 0x00000078, +- 0x80788478, 0xc0211c7a, ++ 0xc0211c7a, 0x00000078, ++ 0x80788478, 0xc0211eba, + 0x00000078, 0x80788478, +- 0xc0211eba, 0x00000078, +- 0x80788478, 0xc0211efa, ++ 0xc0211efa, 0x00000078, ++ 0x80788478, 0xc0211a3a, + 0x00000078, 0x80788478, +- 0xc0211a3a, 0x00000078, +- 0x80788478, 0xc0211a7a, ++ 0xc0211a7a, 0x00000078, ++ 0x80788478, 0xc0211cfa, + 0x00000078, 0x80788478, +- 0xc0211cfa, 0x00000078, +- 0x80788478, 0xbf8cc07f, +- 0xbefc006f, 0xbefe0070, +- 0xbeff0071, 0x866f7bff, +- 0x000003ff, 0xb96f4803, +- 0x866f7bff, 0xfffff800, +- 0x8f6f8b6f, 0xb96fa2c3, +- 0xb973f801, 0xb8ee2a05, +- 0x806e816e, 0x8e6e8a6e, +- 0xb8ef1605, 0x806f816f, +- 0x8e6f866f, 0x806e6f6e, +- 0x806e746e, 0x826f8075, +- 0x866fff6f, 0x0000ffff, +- 0xc00b1c37, 0x00000050, +- 0xc00b1d37, 0x00000060, +- 0xc0031e77, 0x00000074, +- 0xbf8cc07f, 0x866fff6d, +- 0xf8000000, 0x8f6f9b6f, +- 0x8e6f906f, 0xbeee0080, +- 0x876e6f6e, 0x866fff6d, +- 0x04000000, 0x8f6f9a6f, +- 0x8e6f8f6f, 0x876e6f6e, +- 0x866fff7a, 0x00800000, +- 0x8f6f976f, 0xb96ef807, +- 0x866dff6d, 0x0000ffff, +- 0x86fe7e7e, 0x86ea6a6a, +- 0x8f6e837a, 0xb96ee0c2, +- 0xbf800002, 0xb97a0002, +- 0xbf8a0000, 0x95806f6c, +- 0xbf810000, 0x00000000, ++ 0xbf8cc07f, 0xbefc006f, ++ 0xbefe0070, 0xbeff0071, ++ 0x866f7bff, 0x000003ff, ++ 0xb96f4803, 0x866f7bff, ++ 0xfffff800, 0x8f6f8b6f, ++ 0xb96fa2c3, 0xb973f801, ++ 0xb8ee2a05, 0x806e816e, ++ 0x8e6e8a6e, 0xb8ef1605, ++ 0x806f816f, 0x8e6f866f, ++ 0x806e6f6e, 0x806e746e, ++ 0x826f8075, 0x866fff6f, ++ 0x0000ffff, 0xc00b1c37, ++ 0x00000050, 0xc00b1d37, ++ 0x00000060, 0xc0031e77, ++ 0x00000074, 0xbf8cc07f, ++ 0x866fff6d, 0xf8000000, ++ 0x8f6f9b6f, 0x8e6f906f, ++ 0xbeee0080, 0x876e6f6e, ++ 0x866fff6d, 0x04000000, ++ 0x8f6f9a6f, 0x8e6f8f6f, ++ 0x876e6f6e, 0x866fff7a, ++ 0x00800000, 0x8f6f976f, ++ 0xb96ef807, 0x866dff6d, ++ 0x0000ffff, 0x86fe7e7e, ++ 0x86ea6a6a, 0x8f6e837a, ++ 0xb96ee0c2, 0xbf800002, ++ 0xb97a0002, 0xbf8a0000, ++ 0x95806f6c, 0xbf810000, + }; +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +index 6bae2e022c6e..2800e9bba1f9 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +@@ -94,6 +94,7 @@ var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write + var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes + var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency ++var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger + + /**************************************************************************/ + /* variables */ +@@ -107,6 +108,7 @@ var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 + var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 + var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 + var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 ++var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 + + var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 + var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +@@ -127,6 +129,7 @@ var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 + var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 + var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 ++var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 + + var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME + var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +@@ -582,6 +585,16 @@ if G8SR_VGPR_SR_IN_DWX4 + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes + else ++if SAVE_AFTER_XNACK_ERROR ++ check_if_tcp_store_ok() ++ s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP ++ ++ write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) ++ s_branch L_SAVE_LDS ++ ++L_SAVE_FIRST_VGPRS_WITH_TCP: ++end ++ + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 +@@ -681,6 +694,27 @@ elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid ++ ++if SAVE_AFTER_XNACK_ERROR ++ check_if_tcp_store_ok() ++ s_cbranch_scc1 L_SAVE_LDS_WITH_TCP ++ ++ v_lshlrev_b32 v2, 2, v3 ++L_SAVE_LDS_LOOP_SQC: ++ ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 ++ s_waitcnt lgkmcnt(0) ++ ++ write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) ++ ++ v_add_u32 v2, 0x200, v2 ++ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size ++ s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC ++ ++ s_branch L_SAVE_LDS_DONE ++ ++L_SAVE_LDS_WITH_TCP: ++end ++ + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 +@@ -767,6 +801,21 @@ else + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + ++if SAVE_AFTER_XNACK_ERROR ++ check_if_tcp_store_ok() ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP ++ ++L_SAVE_VGPR_LOOP_SQC: ++ write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) ++ ++ s_add_u32 m0, m0, 4 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC ++ ++ s_set_gpr_idx_off ++ s_branch L_SAVE_VGPR_END ++end ++ + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] +@@ -1190,7 +1239,39 @@ function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 + end + ++function check_if_tcp_store_ok ++ // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. ++ s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK ++ s_cbranch_scc1 L_TCP_STORE_CHECK_DONE ++ ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) ++ s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp ++ ++L_TCP_STORE_CHECK_DONE: ++end ++ ++function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset) ++ s_mov_b32 s4, 0 ++ ++L_WRITE_VGPR_LANE_LOOP: ++ for var lane = 0; lane < 4; ++ lane ++ v_readlane_b32 s[lane], v, s4 ++ s_add_u32 s4, s4, 1 ++ end ++ ++ s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 ++ ack_sqc_store_workaround() + ++ s_add_u32 s_mem_offset, s_mem_offset, 0x10 ++ s_cmp_eq_u32 s4, 0x40 ++ s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP ++end ++ ++function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset) ++ for var vgpr = 0; vgpr < n_vgprs; ++ vgpr ++ write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset) ++ end ++end + + function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW +-- +2.17.1 + |