path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch')
-rw-r--r--  meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch  8070
1 file changed, 8070 insertions, 0 deletions
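For orientation, the largest functional change in the patch below is in the CIK interrupt path: the rollback restores a bitfield layout for the IH ring entry (see the cik_int.h hunk) in place of decoding VMID and PASID by masking a raw ring_id word (see the cik_event_interrupt.c hunk). A minimal, hypothetical C sketch of the two styles, with field widths and masks taken from those hunks; the struct and helper names here are illustrative only and are not part of the patch:

	#include <stdint.h>

	/* Bitfield layout restored by the rollback (mirrors the cik_int.h hunk). */
	struct cik_ih_ring_entry_sketch {
		uint32_t source_id:8, reserved1:8, reserved2:16;
		uint32_t data:28, reserved3:4;
		uint32_t pipeid:2, meid:2, reserved4:4, vmid:8, pasid:16;
		uint32_t reserved5;
	};

	/* Mask/shift decoding removed by the rollback (mirrors the lines
	 * deleted from cik_event_interrupt_isr/_wq below). */
	static inline unsigned int ring_id_to_vmid(uint32_t ring_id)
	{
		return (ring_id & 0x0000ff00) >> 8;
	}

	static inline unsigned int ring_id_to_pasid(uint32_t ring_id)
	{
		return (ring_id & 0xffff0000) >> 16;
	}
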
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch
new file mode 100644
index 00000000..7798330d
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5618-drm-amdkfd-Roll-back-all-q4-amdkfd-patches-added-by-.patch
@@ -0,0 +1,8070 @@
+From f00599ff354b3f061df8ce41217562f7c1bfcc2d Mon Sep 17 00:00:00 2001
+From: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+Date: Wed, 9 Jan 2019 21:21:38 +0530
+Subject: [PATCH 5618/5725] drm/amdkfd: Roll back all q4 amdkfd patches added
+ by Kalyan.
+
+Signed-off-by: Ravi Kumar <ravi1.kumar@amd.com>
+Signed-off-by: Chaudhary Amit Kumar <chaudharyamit.kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/Makefile | 4 +-
+ drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 78 +-
+ drivers/gpu/drm/amd/amdkfd/cik_int.h | 25 +-
+ drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 568 ----------
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 298 +++++-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 439 +++++---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1090 ++++++--------------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 60 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 48 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 50 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 230 ++---
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 213 ++--
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 16 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 6 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 29 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 22 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 129 +--
+ drivers/gpu/drm/amd/amdkfd/kfd_events.h | 1 -
+ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 61 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 81 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_iommu.c | 3 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 26 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 119 +++
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 78 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 180 ++--
+ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 21 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 1 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 2 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 28 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 63 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 47 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 102 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 8 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 178 ++--
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 212 ++--
+ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 26 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 8 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 94 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 13 +-
+ drivers/gpu/drm/amd/amdkfd/soc15_int.h | 41 +-
+ 43 files changed, 1930 insertions(+), 2774 deletions(-)
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+ mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device.c
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
+index 4804f9c..b65537a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/Makefile
++++ b/drivers/gpu/drm/amd/amdkfd/Makefile
+@@ -24,7 +24,9 @@
+ #
+
+ FULL_AMD_PATH=$(src)/..
+-ccflags-y := -I$(FULL_AMD_PATH)/include \
++
++ccflags-y := -Iinclude/drm \
++ -I$(FULL_AMD_PATH)/include/ \
+ -I$(FULL_AMD_PATH)/include/asic_reg
+
+ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
+diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+index 5d2475d..751c004 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+@@ -24,6 +24,20 @@
+ #include "kfd_events.h"
+ #include "cik_int.h"
+
++static bool is_cpc_vm_fault(struct kfd_dev *dev,
++ const uint32_t *ih_ring_entry)
++{
++ const struct cik_ih_ring_entry *ihre =
++ (const struct cik_ih_ring_entry *)ih_ring_entry;
++
++ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
++ ihre->vmid >= dev->vm_info.first_vmid_kfd &&
++ ihre->vmid <= dev->vm_info.last_vmid_kfd)
++ return true;
++ return false;
++}
++
+ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry,
+ uint32_t *patched_ihre,
+@@ -32,7 +46,8 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+ const struct cik_ih_ring_entry *ihre =
+ (const struct cik_ih_ring_entry *)ih_ring_entry;
+ const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+- unsigned int vmid, pasid;
++ struct cik_ih_ring_entry *tmp_ihre =
++ (struct cik_ih_ring_entry *) patched_ihre;
+
+ /* This workaround is due to HW/FW limitation on Hawaii that
+ * VMID and PASID are not written into ih_ring_entry
+@@ -40,44 +55,23 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+ dev->device_info->asic_family == CHIP_HAWAII) {
+- struct cik_ih_ring_entry *tmp_ihre =
+- (struct cik_ih_ring_entry *)patched_ihre;
+-
+ *patched_flag = true;
+ *tmp_ihre = *ihre;
+
+- vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
+- pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
+-
+- tmp_ihre->ring_id &= 0x000000ff;
+- tmp_ihre->ring_id |= vmid << 8;
+- tmp_ihre->ring_id |= pasid << 16;
+-
+- return (pasid != 0) &&
+- vmid >= dev->vm_info.first_vmid_kfd &&
+- vmid <= dev->vm_info.last_vmid_kfd;
++ tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
++ tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid(
++ dev->kgd, tmp_ihre->vmid);
++ return (tmp_ihre->pasid != 0) &&
++ tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd &&
++ tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd;
+ }
+-
+- /* Only handle interrupts from KFD VMIDs */
+- vmid = (ihre->ring_id & 0x0000ff00) >> 8;
+- if (vmid < dev->vm_info.first_vmid_kfd ||
+- vmid > dev->vm_info.last_vmid_kfd)
+- return 0;
+-
+- /* If there is no valid PASID, it's likely a firmware bug */
+- pasid = (ihre->ring_id & 0xffff0000) >> 16;
+- if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
+- return 0;
+-
+- /* Interrupt types we care about: various signals and faults.
+- * They will be forwarded to a work queue (see below).
+- */
+- return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
++ /* Do not process in ISR, just request it to be forwarded to WQ. */
++ return (ihre->pasid != 0) &&
++ (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
+ ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
+ ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
+ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+- ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
++ is_cpc_vm_fault(dev, ih_ring_entry));
+ }
+
+ static void cik_event_interrupt_wq(struct kfd_dev *dev,
+@@ -86,35 +80,33 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
+ const struct cik_ih_ring_entry *ihre =
+ (const struct cik_ih_ring_entry *)ih_ring_entry;
+ uint32_t context_id = ihre->data & 0xfffffff;
+- unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8;
+- unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
+
+- if (pasid == 0)
++ if (ihre->pasid == 0)
+ return;
+
+ if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
+- kfd_signal_event_interrupt(pasid, context_id, 28);
++ kfd_signal_event_interrupt(ihre->pasid, context_id, 28);
+ else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP)
+- kfd_signal_event_interrupt(pasid, context_id, 28);
++ kfd_signal_event_interrupt(ihre->pasid, context_id, 28);
+ else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG)
+- kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
++ kfd_signal_event_interrupt(ihre->pasid, context_id & 0xff, 8);
+ else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
+- kfd_signal_hw_exception_event(pasid);
++ kfd_signal_hw_exception_event(ihre->pasid);
+ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+ struct kfd_vm_fault_info info;
+
+- kfd_process_vm_fault(dev->dqm, pasid);
++ kfd_process_vm_fault(dev->dqm, ihre->pasid);
+
+ memset(&info, 0, sizeof(info));
+ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+ if (!info.page_addr && !info.status)
+ return;
+
+- if (info.vmid == vmid)
+- kfd_signal_vm_fault_event(dev, pasid, &info);
++ if (info.vmid == ihre->vmid)
++ kfd_signal_vm_fault_event(dev, ihre->pasid, &info);
+ else
+- kfd_signal_vm_fault_event(dev, pasid, NULL);
++ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL);
+ }
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
+index a2079a0..ff8255d 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
+@@ -26,19 +26,32 @@
+ #include <linux/types.h>
+
+ struct cik_ih_ring_entry {
+- uint32_t source_id;
+- uint32_t data;
+- uint32_t ring_id;
+- uint32_t reserved;
++ uint32_t source_id:8;
++ uint32_t reserved1:8;
++ uint32_t reserved2:16;
++
++ uint32_t data:28;
++ uint32_t reserved3:4;
++
++ /* pipeid, meid and unused3 are officially called RINGID,
++ * but for our purposes, they always decode into pipe and ME.
++ */
++ uint32_t pipeid:2;
++ uint32_t meid:2;
++ uint32_t reserved4:4;
++ uint32_t vmid:8;
++ uint32_t pasid:16;
++
++ uint32_t reserved5;
+ };
+
++#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
+ #define CIK_INTSRC_CP_END_OF_PIPE 0xB5
+ #define CIK_INTSRC_CP_BAD_OPCODE 0xB7
+-#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
+-#define CIK_INTSRC_SDMA_TRAP 0xE0
+ #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
+ #define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
+ #define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
++#define CIK_INTSRC_SDMA_TRAP 0xE0
+
+ #endif
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+deleted file mode 100644
+index 3621efb..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
++++ /dev/null
+@@ -1,568 +0,0 @@
+-/*
+- * Copyright 2018 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-static const uint32_t cwsr_trap_gfx8_hex[] = {
+- 0xbf820001, 0xbf82012b,
+- 0xb8f4f802, 0x89748674,
+- 0xb8f5f803, 0x8675ff75,
+- 0x00000400, 0xbf850017,
+- 0xc00a1e37, 0x00000000,
+- 0xbf8c007f, 0x87777978,
+- 0xbf840005, 0x8f728374,
+- 0xb972e0c2, 0xbf800002,
+- 0xb9740002, 0xbe801d78,
+- 0xb8f5f803, 0x8675ff75,
+- 0x000001ff, 0xbf850002,
+- 0x80708470, 0x82718071,
+- 0x8671ff71, 0x0000ffff,
+- 0x8f728374, 0xb972e0c2,
+- 0xbf800002, 0xb9740002,
+- 0xbe801f70, 0xb8f5f803,
+- 0x8675ff75, 0x00000100,
+- 0xbf840006, 0xbefa0080,
+- 0xb97a0203, 0x8671ff71,
+- 0x0000ffff, 0x80f08870,
+- 0x82f18071, 0xbefa0080,
+- 0xb97a0283, 0xbef60068,
+- 0xbef70069, 0xb8fa1c07,
+- 0x8e7a9c7a, 0x87717a71,
+- 0xb8fa03c7, 0x8e7a9b7a,
+- 0x87717a71, 0xb8faf807,
+- 0x867aff7a, 0x00007fff,
+- 0xb97af807, 0xbef2007e,
+- 0xbef3007f, 0xbefe0180,
+- 0xbf900004, 0x877a8474,
+- 0xb97af802, 0xbf8e0002,
+- 0xbf88fffe, 0xbef8007e,
+- 0x8679ff7f, 0x0000ffff,
+- 0x8779ff79, 0x00040000,
+- 0xbefa0080, 0xbefb00ff,
+- 0x00807fac, 0x867aff7f,
+- 0x08000000, 0x8f7a837a,
+- 0x877b7a7b, 0x867aff7f,
+- 0x70000000, 0x8f7a817a,
+- 0x877b7a7b, 0xbeef007c,
+- 0xbeee0080, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8fa1605, 0x807a817a,
+- 0x8e7a867a, 0x806e7a6e,
+- 0xbefa0084, 0xbefa00ff,
+- 0x01000000, 0xbefe007c,
+- 0xbefc006e, 0xc0611bfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611c3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611c7c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611cbc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611cfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611d3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xb8f5f803,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611d7c, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611dbc, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611dfc, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xb8eff801, 0xbefe007c,
+- 0xbefc006e, 0xc0611bfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611b3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611b7c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0x867aff7f,
+- 0x04000000, 0xbef30080,
+- 0x8773737a, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8f51605, 0x80758175,
+- 0x8e758475, 0x8e7a8275,
+- 0xbefa00ff, 0x01000000,
+- 0xbef60178, 0x80786e78,
+- 0x82798079, 0xbefc0080,
+- 0xbe802b00, 0xbe822b02,
+- 0xbe842b04, 0xbe862b06,
+- 0xbe882b08, 0xbe8a2b0a,
+- 0xbe8c2b0c, 0xbe8e2b0e,
+- 0xc06b003c, 0x00000000,
+- 0xc06b013c, 0x00000010,
+- 0xc06b023c, 0x00000020,
+- 0xc06b033c, 0x00000030,
+- 0x8078c078, 0x82798079,
+- 0x807c907c, 0xbf0a757c,
+- 0xbf85ffeb, 0xbef80176,
+- 0xbeee0080, 0xbefe00c1,
+- 0xbeff00c1, 0xbefa00ff,
+- 0x01000000, 0xe0724000,
+- 0x6e1e0000, 0xe0724100,
+- 0x6e1e0100, 0xe0724200,
+- 0x6e1e0200, 0xe0724300,
+- 0x6e1e0300, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f54306,
+- 0x8675c175, 0xbf84002c,
+- 0xbf8a0000, 0x867aff73,
+- 0x04000000, 0xbf840028,
+- 0x8e758675, 0x8e758275,
+- 0xbefa0075, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8fa1605, 0x807a817a,
+- 0x8e7a867a, 0x806e7a6e,
+- 0x806eff6e, 0x00000080,
+- 0xbefa00ff, 0x01000000,
+- 0xbefc0080, 0xd28c0002,
+- 0x000100c1, 0xd28d0003,
+- 0x000204c1, 0xd1060002,
+- 0x00011103, 0x7e0602ff,
+- 0x00000200, 0xbefc00ff,
+- 0x00010000, 0xbe80007b,
+- 0x867bff7b, 0xff7fffff,
+- 0x877bff7b, 0x00058000,
+- 0xd8ec0000, 0x00000002,
+- 0xbf8c007f, 0xe0765000,
+- 0x6e1e0002, 0x32040702,
+- 0xd0c9006a, 0x0000eb02,
+- 0xbf87fff7, 0xbefb0000,
+- 0xbeee00ff, 0x00000400,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f52a05, 0x80758175,
+- 0x8e758275, 0x8e7a8875,
+- 0xbefa00ff, 0x01000000,
+- 0xbefc0084, 0xbf0a757c,
+- 0xbf840015, 0xbf11017c,
+- 0x8075ff75, 0x00001000,
+- 0x7e000300, 0x7e020301,
+- 0x7e040302, 0x7e060303,
+- 0xe0724000, 0x6e1e0000,
+- 0xe0724100, 0x6e1e0100,
+- 0xe0724200, 0x6e1e0200,
+- 0xe0724300, 0x6e1e0300,
+- 0x807c847c, 0x806eff6e,
+- 0x00000400, 0xbf0a757c,
+- 0xbf85ffef, 0xbf9c0000,
+- 0xbf8200cd, 0xbef8007e,
+- 0x8679ff7f, 0x0000ffff,
+- 0x8779ff79, 0x00040000,
+- 0xbefa0080, 0xbefb00ff,
+- 0x00807fac, 0x8676ff7f,
+- 0x08000000, 0x8f768376,
+- 0x877b767b, 0x8676ff7f,
+- 0x70000000, 0x8f768176,
+- 0x877b767b, 0x8676ff7f,
+- 0x04000000, 0xbf84001e,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f34306, 0x8673c173,
+- 0xbf840019, 0x8e738673,
+- 0x8e738273, 0xbefa0073,
+- 0xb8f22a05, 0x80728172,
+- 0x8e728a72, 0xb8f61605,
+- 0x80768176, 0x8e768676,
+- 0x80727672, 0x8072ff72,
+- 0x00000080, 0xbefa00ff,
+- 0x01000000, 0xbefc0080,
+- 0xe0510000, 0x721e0000,
+- 0xe0510100, 0x721e0000,
+- 0x807cff7c, 0x00000200,
+- 0x8072ff72, 0x00000200,
+- 0xbf0a737c, 0xbf85fff6,
+- 0xbef20080, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f32a05,
+- 0x80738173, 0x8e738273,
+- 0x8e7a8873, 0xbefa00ff,
+- 0x01000000, 0xbef60072,
+- 0x8072ff72, 0x00000400,
+- 0xbefc0084, 0xbf11087c,
+- 0x8073ff73, 0x00008000,
+- 0xe0524000, 0x721e0000,
+- 0xe0524100, 0x721e0100,
+- 0xe0524200, 0x721e0200,
+- 0xe0524300, 0x721e0300,
+- 0xbf8c0f70, 0x7e000300,
+- 0x7e020301, 0x7e040302,
+- 0x7e060303, 0x807c847c,
+- 0x8072ff72, 0x00000400,
+- 0xbf0a737c, 0xbf85ffee,
+- 0xbf9c0000, 0xe0524000,
+- 0x761e0000, 0xe0524100,
+- 0x761e0100, 0xe0524200,
+- 0x761e0200, 0xe0524300,
+- 0x761e0300, 0xb8f22a05,
+- 0x80728172, 0x8e728a72,
+- 0xb8f61605, 0x80768176,
+- 0x8e768676, 0x80727672,
+- 0x80f2c072, 0xb8f31605,
+- 0x80738173, 0x8e738473,
+- 0x8e7a8273, 0xbefa00ff,
+- 0x01000000, 0xbefc0073,
+- 0xc031003c, 0x00000072,
+- 0x80f2c072, 0xbf8c007f,
+- 0x80fc907c, 0xbe802d00,
+- 0xbe822d02, 0xbe842d04,
+- 0xbe862d06, 0xbe882d08,
+- 0xbe8a2d0a, 0xbe8c2d0c,
+- 0xbe8e2d0e, 0xbf06807c,
+- 0xbf84fff1, 0xb8f22a05,
+- 0x80728172, 0x8e728a72,
+- 0xb8f61605, 0x80768176,
+- 0x8e768676, 0x80727672,
+- 0xbefa0084, 0xbefa00ff,
+- 0x01000000, 0xc0211cfc,
+- 0x00000072, 0x80728472,
+- 0xc0211c3c, 0x00000072,
+- 0x80728472, 0xc0211c7c,
+- 0x00000072, 0x80728472,
+- 0xc0211bbc, 0x00000072,
+- 0x80728472, 0xc0211bfc,
+- 0x00000072, 0x80728472,
+- 0xc0211d3c, 0x00000072,
+- 0x80728472, 0xc0211d7c,
+- 0x00000072, 0x80728472,
+- 0xc0211a3c, 0x00000072,
+- 0x80728472, 0xc0211a7c,
+- 0x00000072, 0x80728472,
+- 0xc0211dfc, 0x00000072,
+- 0x80728472, 0xc0211b3c,
+- 0x00000072, 0x80728472,
+- 0xc0211b7c, 0x00000072,
+- 0x80728472, 0xbf8c007f,
+- 0xbefc0073, 0xbefe006e,
+- 0xbeff006f, 0x867375ff,
+- 0x000003ff, 0xb9734803,
+- 0x867375ff, 0xfffff800,
+- 0x8f738b73, 0xb973a2c3,
+- 0xb977f801, 0x8673ff71,
+- 0xf0000000, 0x8f739c73,
+- 0x8e739073, 0xbef60080,
+- 0x87767376, 0x8673ff71,
+- 0x08000000, 0x8f739b73,
+- 0x8e738f73, 0x87767376,
+- 0x8673ff74, 0x00800000,
+- 0x8f739773, 0xb976f807,
+- 0x8671ff71, 0x0000ffff,
+- 0x86fe7e7e, 0x86ea6a6a,
+- 0x8f768374, 0xb976e0c2,
+- 0xbf800002, 0xb9740002,
+- 0xbf8a0000, 0x95807370,
+- 0xbf810000, 0x00000000,
+-};
+-
+-
+-static const uint32_t cwsr_trap_gfx9_hex[] = {
+- 0xbf820001, 0xbf82015d,
+- 0xb8f8f802, 0x89788678,
+- 0xb8f1f803, 0x866eff71,
+- 0x00000400, 0xbf850037,
+- 0x866eff71, 0x00000800,
+- 0xbf850003, 0x866eff71,
+- 0x00000100, 0xbf840008,
+- 0x866eff78, 0x00002000,
+- 0xbf840001, 0xbf810000,
+- 0x8778ff78, 0x00002000,
+- 0x80ec886c, 0x82ed806d,
+- 0xb8eef807, 0x866fff6e,
+- 0x001f8000, 0x8e6f8b6f,
+- 0x8977ff77, 0xfc000000,
+- 0x87776f77, 0x896eff6e,
+- 0x001f8000, 0xb96ef807,
+- 0xb8f0f812, 0xb8f1f813,
+- 0x8ef08870, 0xc0071bb8,
+- 0x00000000, 0xbf8cc07f,
+- 0xc0071c38, 0x00000008,
+- 0xbf8cc07f, 0x86ee6e6e,
+- 0xbf840001, 0xbe801d6e,
+- 0xb8f1f803, 0x8671ff71,
+- 0x000001ff, 0xbf850002,
+- 0x806c846c, 0x826d806d,
+- 0x866dff6d, 0x0000ffff,
+- 0x8f6e8b77, 0x866eff6e,
+- 0x001f8000, 0xb96ef807,
+- 0x86fe7e7e, 0x86ea6a6a,
+- 0x8f6e8378, 0xb96ee0c2,
+- 0xbf800002, 0xb9780002,
+- 0xbe801f6c, 0x866dff6d,
+- 0x0000ffff, 0xbef00080,
+- 0xb9700283, 0xb8f02407,
+- 0x8e709c70, 0x876d706d,
+- 0xb8f003c7, 0x8e709b70,
+- 0x876d706d, 0xb8f0f807,
+- 0x8670ff70, 0x00007fff,
+- 0xb970f807, 0xbeee007e,
+- 0xbeef007f, 0xbefe0180,
+- 0xbf900004, 0x87708478,
+- 0xb970f802, 0xbf8e0002,
+- 0xbf88fffe, 0xb8f02a05,
+- 0x80708170, 0x8e708a70,
+- 0xb8f11605, 0x80718171,
+- 0x8e718671, 0x80707170,
+- 0x80707e70, 0x8271807f,
+- 0x8671ff71, 0x0000ffff,
+- 0xc0471cb8, 0x00000040,
+- 0xbf8cc07f, 0xc04b1d38,
+- 0x00000048, 0xbf8cc07f,
+- 0xc0431e78, 0x00000058,
+- 0xbf8cc07f, 0xc0471eb8,
+- 0x0000005c, 0xbf8cc07f,
+- 0xbef4007e, 0x8675ff7f,
+- 0x0000ffff, 0x8775ff75,
+- 0x00040000, 0xbef60080,
+- 0xbef700ff, 0x00807fac,
+- 0x8670ff7f, 0x08000000,
+- 0x8f708370, 0x87777077,
+- 0x8670ff7f, 0x70000000,
+- 0x8f708170, 0x87777077,
+- 0xbefb007c, 0xbefa0080,
+- 0xb8fa2a05, 0x807a817a,
+- 0x8e7a8a7a, 0xb8f01605,
+- 0x80708170, 0x8e708670,
+- 0x807a707a, 0xbef60084,
+- 0xbef600ff, 0x01000000,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611efa, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611b3a,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611b7a, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611bba,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611bfa, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611e3a,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xb8f1f803, 0xbefe007c,
+- 0xbefc007a, 0xc0611c7a,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611a3a, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611a7a,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xb8fbf801, 0xbefe007c,
+- 0xbefc007a, 0xc0611efa,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0x8670ff7f, 0x04000000,
+- 0xbeef0080, 0x876f6f70,
+- 0xb8fa2a05, 0x807a817a,
+- 0x8e7a8a7a, 0xb8f11605,
+- 0x80718171, 0x8e718471,
+- 0x8e768271, 0xbef600ff,
+- 0x01000000, 0xbef20174,
+- 0x80747a74, 0x82758075,
+- 0xbefc0080, 0xbf800000,
+- 0xbe802b00, 0xbe822b02,
+- 0xbe842b04, 0xbe862b06,
+- 0xbe882b08, 0xbe8a2b0a,
+- 0xbe8c2b0c, 0xbe8e2b0e,
+- 0xc06b003a, 0x00000000,
+- 0xbf8cc07f, 0xc06b013a,
+- 0x00000010, 0xbf8cc07f,
+- 0xc06b023a, 0x00000020,
+- 0xbf8cc07f, 0xc06b033a,
+- 0x00000030, 0xbf8cc07f,
+- 0x8074c074, 0x82758075,
+- 0x807c907c, 0xbf0a717c,
+- 0xbf85ffe7, 0xbef40172,
+- 0xbefa0080, 0xbefe00c1,
+- 0xbeff00c1, 0xbee80080,
+- 0xbee90080, 0xbef600ff,
+- 0x01000000, 0xe0724000,
+- 0x7a1d0000, 0xe0724100,
+- 0x7a1d0100, 0xe0724200,
+- 0x7a1d0200, 0xe0724300,
+- 0x7a1d0300, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f14306,
+- 0x8671c171, 0xbf84002c,
+- 0xbf8a0000, 0x8670ff6f,
+- 0x04000000, 0xbf840028,
+- 0x8e718671, 0x8e718271,
+- 0xbef60071, 0xb8fa2a05,
+- 0x807a817a, 0x8e7a8a7a,
+- 0xb8f01605, 0x80708170,
+- 0x8e708670, 0x807a707a,
+- 0x807aff7a, 0x00000080,
+- 0xbef600ff, 0x01000000,
+- 0xbefc0080, 0xd28c0002,
+- 0x000100c1, 0xd28d0003,
+- 0x000204c1, 0xd1060002,
+- 0x00011103, 0x7e0602ff,
+- 0x00000200, 0xbefc00ff,
+- 0x00010000, 0xbe800077,
+- 0x8677ff77, 0xff7fffff,
+- 0x8777ff77, 0x00058000,
+- 0xd8ec0000, 0x00000002,
+- 0xbf8cc07f, 0xe0765000,
+- 0x7a1d0002, 0x68040702,
+- 0xd0c9006a, 0x0000e302,
+- 0xbf87fff7, 0xbef70000,
+- 0xbefa00ff, 0x00000400,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f12a05, 0x80718171,
+- 0x8e718271, 0x8e768871,
+- 0xbef600ff, 0x01000000,
+- 0xbefc0084, 0xbf0a717c,
+- 0xbf840015, 0xbf11017c,
+- 0x8071ff71, 0x00001000,
+- 0x7e000300, 0x7e020301,
+- 0x7e040302, 0x7e060303,
+- 0xe0724000, 0x7a1d0000,
+- 0xe0724100, 0x7a1d0100,
+- 0xe0724200, 0x7a1d0200,
+- 0xe0724300, 0x7a1d0300,
+- 0x807c847c, 0x807aff7a,
+- 0x00000400, 0xbf0a717c,
+- 0xbf85ffef, 0xbf9c0000,
+- 0xbf8200dc, 0xbef4007e,
+- 0x8675ff7f, 0x0000ffff,
+- 0x8775ff75, 0x00040000,
+- 0xbef60080, 0xbef700ff,
+- 0x00807fac, 0x866eff7f,
+- 0x08000000, 0x8f6e836e,
+- 0x87776e77, 0x866eff7f,
+- 0x70000000, 0x8f6e816e,
+- 0x87776e77, 0x866eff7f,
+- 0x04000000, 0xbf84001e,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8ef4306, 0x866fc16f,
+- 0xbf840019, 0x8e6f866f,
+- 0x8e6f826f, 0xbef6006f,
+- 0xb8f82a05, 0x80788178,
+- 0x8e788a78, 0xb8ee1605,
+- 0x806e816e, 0x8e6e866e,
+- 0x80786e78, 0x8078ff78,
+- 0x00000080, 0xbef600ff,
+- 0x01000000, 0xbefc0080,
+- 0xe0510000, 0x781d0000,
+- 0xe0510100, 0x781d0000,
+- 0x807cff7c, 0x00000200,
+- 0x8078ff78, 0x00000200,
+- 0xbf0a6f7c, 0xbf85fff6,
+- 0xbef80080, 0xbefe00c1,
+- 0xbeff00c1, 0xb8ef2a05,
+- 0x806f816f, 0x8e6f826f,
+- 0x8e76886f, 0xbef600ff,
+- 0x01000000, 0xbeee0078,
+- 0x8078ff78, 0x00000400,
+- 0xbefc0084, 0xbf11087c,
+- 0x806fff6f, 0x00008000,
+- 0xe0524000, 0x781d0000,
+- 0xe0524100, 0x781d0100,
+- 0xe0524200, 0x781d0200,
+- 0xe0524300, 0x781d0300,
+- 0xbf8c0f70, 0x7e000300,
+- 0x7e020301, 0x7e040302,
+- 0x7e060303, 0x807c847c,
+- 0x8078ff78, 0x00000400,
+- 0xbf0a6f7c, 0xbf85ffee,
+- 0xbf9c0000, 0xe0524000,
+- 0x6e1d0000, 0xe0524100,
+- 0x6e1d0100, 0xe0524200,
+- 0x6e1d0200, 0xe0524300,
+- 0x6e1d0300, 0xb8f82a05,
+- 0x80788178, 0x8e788a78,
+- 0xb8ee1605, 0x806e816e,
+- 0x8e6e866e, 0x80786e78,
+- 0x80f8c078, 0xb8ef1605,
+- 0x806f816f, 0x8e6f846f,
+- 0x8e76826f, 0xbef600ff,
+- 0x01000000, 0xbefc006f,
+- 0xc031003a, 0x00000078,
+- 0x80f8c078, 0xbf8cc07f,
+- 0x80fc907c, 0xbf800000,
+- 0xbe802d00, 0xbe822d02,
+- 0xbe842d04, 0xbe862d06,
+- 0xbe882d08, 0xbe8a2d0a,
+- 0xbe8c2d0c, 0xbe8e2d0e,
+- 0xbf06807c, 0xbf84fff0,
+- 0xb8f82a05, 0x80788178,
+- 0x8e788a78, 0xb8ee1605,
+- 0x806e816e, 0x8e6e866e,
+- 0x80786e78, 0xbef60084,
+- 0xbef600ff, 0x01000000,
+- 0xc0211bfa, 0x00000078,
+- 0x80788478, 0xc0211b3a,
+- 0x00000078, 0x80788478,
+- 0xc0211b7a, 0x00000078,
+- 0x80788478, 0xc0211eba,
+- 0x00000078, 0x80788478,
+- 0xc0211efa, 0x00000078,
+- 0x80788478, 0xc0211c3a,
+- 0x00000078, 0x80788478,
+- 0xc0211c7a, 0x00000078,
+- 0x80788478, 0xc0211a3a,
+- 0x00000078, 0x80788478,
+- 0xc0211a7a, 0x00000078,
+- 0x80788478, 0xc0211cfa,
+- 0x00000078, 0x80788478,
+- 0xbf8cc07f, 0xbefc006f,
+- 0xbefe007a, 0xbeff007b,
+- 0x866f71ff, 0x000003ff,
+- 0xb96f4803, 0x866f71ff,
+- 0xfffff800, 0x8f6f8b6f,
+- 0xb96fa2c3, 0xb973f801,
+- 0xb8ee2a05, 0x806e816e,
+- 0x8e6e8a6e, 0xb8ef1605,
+- 0x806f816f, 0x8e6f866f,
+- 0x806e6f6e, 0x806e746e,
+- 0x826f8075, 0x866fff6f,
+- 0x0000ffff, 0xc0071cb7,
+- 0x00000040, 0xc00b1d37,
+- 0x00000048, 0xc0031e77,
+- 0x00000058, 0xc0071eb7,
+- 0x0000005c, 0xbf8cc07f,
+- 0x866fff6d, 0xf0000000,
+- 0x8f6f9c6f, 0x8e6f906f,
+- 0xbeee0080, 0x876e6f6e,
+- 0x866fff6d, 0x08000000,
+- 0x8f6f9b6f, 0x8e6f8f6f,
+- 0x876e6f6e, 0x866fff70,
+- 0x00800000, 0x8f6f976f,
+- 0xb96ef807, 0x866dff6d,
+- 0x0000ffff, 0x86fe7e7e,
+- 0x86ea6a6a, 0x8f6e8370,
+- 0xb96ee0c2, 0xbf800002,
+- 0xb9700002, 0xbf8a0000,
+- 0x95806f6c, 0xbf810000,
+-};
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+index abe1a5d..751cc2e 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+@@ -20,12 +20,9 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+-/* To compile this assembly code:
+- * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
+- */
+-
+-/* HW (VI) source code for CWSR trap handler */
+-/* Version 18 + multiple trap handler */
++#if 0
++HW (VI) source code for CWSR trap handler
++#Version 18 + multiple trap handler
+
+ // this performance-optimal version was originally from Seven Xu at SRDC
+
+@@ -77,7 +74,7 @@ var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_D
+ /*************************************************************************/
+ /* control on how to run the shader */
+ /*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
++//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
+ var EMU_RUN_HACK = 0
+ var EMU_RUN_HACK_RESTORE_NORMAL = 0
+ var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+@@ -91,9 +88,9 @@ var WG_BASE_ADDR_HI = 0x0
+ var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+ var CTX_SAVE_CONTROL = 0x0
+ var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
+ var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+ var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+
+ /**************************************************************************/
+@@ -101,12 +98,7 @@ var SWIZZLE_EN = 0 //whether we use swi
+ /**************************************************************************/
+ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+ var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
+ var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+-var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0
+-var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1
+-var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3
+-var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29
+
+ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+@@ -157,7 +149,7 @@ var s_save_spi_init_lo = exec_lo
+ var s_save_spi_init_hi = exec_hi
+
+ //tba_lo and tba_hi need to be saved/restored
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+ var s_save_pc_hi = ttmp1
+ var s_save_exec_lo = ttmp2
+ var s_save_exec_hi = ttmp3
+@@ -255,7 +247,7 @@ if (!EMU_RUN_HACK)
+ s_waitcnt lgkmcnt(0)
+ s_or_b32 ttmp7, ttmp8, ttmp9
+ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set
+- set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+
+ L_NO_NEXT_TRAP:
+@@ -266,7 +258,7 @@ L_NO_NEXT_TRAP:
+ s_addc_u32 ttmp1, ttmp1, 0
+ L_EXCP_CASE:
+ s_and_b32 ttmp1, ttmp1, 0xFFFF
+- set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+ s_rfe_b64 [ttmp0, ttmp1]
+ end
+ // ********* End handling of non-CWSR traps *******************
+@@ -327,10 +319,6 @@ end
+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+ end
+
+- // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
+- s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
+-
+ L_SLEEP:
+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+
+@@ -1019,6 +1007,8 @@ end
+
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+
++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
++
+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+@@ -1054,12 +1044,11 @@ end
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+
+- s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
++ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
+
+ if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_d
+@@ -1139,10 +1128,257 @@ function get_hwreg_size_bytes
+ return 128 //HWREG size 128 bytes
+ end
+
+-function set_status_without_spi_prio(status, tmp)
+- // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
+- s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
+- s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
+- s_nop 0x2 // avoid S_SETREG => S_SETREG hazard
+- s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
+-end
++
++#endif
++
++static const uint32_t cwsr_trap_gfx8_hex[] = {
++ 0xbf820001, 0xbf820123,
++ 0xb8f4f802, 0x89748674,
++ 0xb8f5f803, 0x8675ff75,
++ 0x00000400, 0xbf850011,
++ 0xc00a1e37, 0x00000000,
++ 0xbf8c007f, 0x87777978,
++ 0xbf840002, 0xb974f802,
++ 0xbe801d78, 0xb8f5f803,
++ 0x8675ff75, 0x000001ff,
++ 0xbf850002, 0x80708470,
++ 0x82718071, 0x8671ff71,
++ 0x0000ffff, 0xb974f802,
++ 0xbe801f70, 0xb8f5f803,
++ 0x8675ff75, 0x00000100,
++ 0xbf840006, 0xbefa0080,
++ 0xb97a0203, 0x8671ff71,
++ 0x0000ffff, 0x80f08870,
++ 0x82f18071, 0xbefa0080,
++ 0xb97a0283, 0xbef60068,
++ 0xbef70069, 0xb8fa1c07,
++ 0x8e7a9c7a, 0x87717a71,
++ 0xb8fa03c7, 0x8e7a9b7a,
++ 0x87717a71, 0xb8faf807,
++ 0x867aff7a, 0x00007fff,
++ 0xb97af807, 0xbef2007e,
++ 0xbef3007f, 0xbefe0180,
++ 0xbf900004, 0xbf8e0002,
++ 0xbf88fffe, 0xbef8007e,
++ 0x8679ff7f, 0x0000ffff,
++ 0x8779ff79, 0x00040000,
++ 0xbefa0080, 0xbefb00ff,
++ 0x00807fac, 0x867aff7f,
++ 0x08000000, 0x8f7a837a,
++ 0x877b7a7b, 0x867aff7f,
++ 0x70000000, 0x8f7a817a,
++ 0x877b7a7b, 0xbeef007c,
++ 0xbeee0080, 0xb8ee2a05,
++ 0x806e816e, 0x8e6e8a6e,
++ 0xb8fa1605, 0x807a817a,
++ 0x8e7a867a, 0x806e7a6e,
++ 0xbefa0084, 0xbefa00ff,
++ 0x01000000, 0xbefe007c,
++ 0xbefc006e, 0xc0611bfc,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611c3c,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611c7c,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611cbc,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611cfc,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611d3c,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xb8f5f803,
++ 0xbefe007c, 0xbefc006e,
++ 0xc0611d7c, 0x0000007c,
++ 0x806e846e, 0xbefc007e,
++ 0xbefe007c, 0xbefc006e,
++ 0xc0611dbc, 0x0000007c,
++ 0x806e846e, 0xbefc007e,
++ 0xbefe007c, 0xbefc006e,
++ 0xc0611dfc, 0x0000007c,
++ 0x806e846e, 0xbefc007e,
++ 0xb8eff801, 0xbefe007c,
++ 0xbefc006e, 0xc0611bfc,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611b3c,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc006e, 0xc0611b7c,
++ 0x0000007c, 0x806e846e,
++ 0xbefc007e, 0x867aff7f,
++ 0x04000000, 0xbef30080,
++ 0x8773737a, 0xb8ee2a05,
++ 0x806e816e, 0x8e6e8a6e,
++ 0xb8f51605, 0x80758175,
++ 0x8e758475, 0x8e7a8275,
++ 0xbefa00ff, 0x01000000,
++ 0xbef60178, 0x80786e78,
++ 0x82798079, 0xbefc0080,
++ 0xbe802b00, 0xbe822b02,
++ 0xbe842b04, 0xbe862b06,
++ 0xbe882b08, 0xbe8a2b0a,
++ 0xbe8c2b0c, 0xbe8e2b0e,
++ 0xc06b003c, 0x00000000,
++ 0xc06b013c, 0x00000010,
++ 0xc06b023c, 0x00000020,
++ 0xc06b033c, 0x00000030,
++ 0x8078c078, 0x82798079,
++ 0x807c907c, 0xbf0a757c,
++ 0xbf85ffeb, 0xbef80176,
++ 0xbeee0080, 0xbefe00c1,
++ 0xbeff00c1, 0xbefa00ff,
++ 0x01000000, 0xe0724000,
++ 0x6e1e0000, 0xe0724100,
++ 0x6e1e0100, 0xe0724200,
++ 0x6e1e0200, 0xe0724300,
++ 0x6e1e0300, 0xbefe00c1,
++ 0xbeff00c1, 0xb8f54306,
++ 0x8675c175, 0xbf84002c,
++ 0xbf8a0000, 0x867aff73,
++ 0x04000000, 0xbf840028,
++ 0x8e758675, 0x8e758275,
++ 0xbefa0075, 0xb8ee2a05,
++ 0x806e816e, 0x8e6e8a6e,
++ 0xb8fa1605, 0x807a817a,
++ 0x8e7a867a, 0x806e7a6e,
++ 0x806eff6e, 0x00000080,
++ 0xbefa00ff, 0x01000000,
++ 0xbefc0080, 0xd28c0002,
++ 0x000100c1, 0xd28d0003,
++ 0x000204c1, 0xd1060002,
++ 0x00011103, 0x7e0602ff,
++ 0x00000200, 0xbefc00ff,
++ 0x00010000, 0xbe80007b,
++ 0x867bff7b, 0xff7fffff,
++ 0x877bff7b, 0x00058000,
++ 0xd8ec0000, 0x00000002,
++ 0xbf8c007f, 0xe0765000,
++ 0x6e1e0002, 0x32040702,
++ 0xd0c9006a, 0x0000eb02,
++ 0xbf87fff7, 0xbefb0000,
++ 0xbeee00ff, 0x00000400,
++ 0xbefe00c1, 0xbeff00c1,
++ 0xb8f52a05, 0x80758175,
++ 0x8e758275, 0x8e7a8875,
++ 0xbefa00ff, 0x01000000,
++ 0xbefc0084, 0xbf0a757c,
++ 0xbf840015, 0xbf11017c,
++ 0x8075ff75, 0x00001000,
++ 0x7e000300, 0x7e020301,
++ 0x7e040302, 0x7e060303,
++ 0xe0724000, 0x6e1e0000,
++ 0xe0724100, 0x6e1e0100,
++ 0xe0724200, 0x6e1e0200,
++ 0xe0724300, 0x6e1e0300,
++ 0x807c847c, 0x806eff6e,
++ 0x00000400, 0xbf0a757c,
++ 0xbf85ffef, 0xbf9c0000,
++ 0xbf8200ca, 0xbef8007e,
++ 0x8679ff7f, 0x0000ffff,
++ 0x8779ff79, 0x00040000,
++ 0xbefa0080, 0xbefb00ff,
++ 0x00807fac, 0x8676ff7f,
++ 0x08000000, 0x8f768376,
++ 0x877b767b, 0x8676ff7f,
++ 0x70000000, 0x8f768176,
++ 0x877b767b, 0x8676ff7f,
++ 0x04000000, 0xbf84001e,
++ 0xbefe00c1, 0xbeff00c1,
++ 0xb8f34306, 0x8673c173,
++ 0xbf840019, 0x8e738673,
++ 0x8e738273, 0xbefa0073,
++ 0xb8f22a05, 0x80728172,
++ 0x8e728a72, 0xb8f61605,
++ 0x80768176, 0x8e768676,
++ 0x80727672, 0x8072ff72,
++ 0x00000080, 0xbefa00ff,
++ 0x01000000, 0xbefc0080,
++ 0xe0510000, 0x721e0000,
++ 0xe0510100, 0x721e0000,
++ 0x807cff7c, 0x00000200,
++ 0x8072ff72, 0x00000200,
++ 0xbf0a737c, 0xbf85fff6,
++ 0xbef20080, 0xbefe00c1,
++ 0xbeff00c1, 0xb8f32a05,
++ 0x80738173, 0x8e738273,
++ 0x8e7a8873, 0xbefa00ff,
++ 0x01000000, 0xbef60072,
++ 0x8072ff72, 0x00000400,
++ 0xbefc0084, 0xbf11087c,
++ 0x8073ff73, 0x00008000,
++ 0xe0524000, 0x721e0000,
++ 0xe0524100, 0x721e0100,
++ 0xe0524200, 0x721e0200,
++ 0xe0524300, 0x721e0300,
++ 0xbf8c0f70, 0x7e000300,
++ 0x7e020301, 0x7e040302,
++ 0x7e060303, 0x807c847c,
++ 0x8072ff72, 0x00000400,
++ 0xbf0a737c, 0xbf85ffee,
++ 0xbf9c0000, 0xe0524000,
++ 0x761e0000, 0xe0524100,
++ 0x761e0100, 0xe0524200,
++ 0x761e0200, 0xe0524300,
++ 0x761e0300, 0xb8f22a05,
++ 0x80728172, 0x8e728a72,
++ 0xb8f61605, 0x80768176,
++ 0x8e768676, 0x80727672,
++ 0x80f2c072, 0xb8f31605,
++ 0x80738173, 0x8e738473,
++ 0x8e7a8273, 0xbefa00ff,
++ 0x01000000, 0xbefc0073,
++ 0xc031003c, 0x00000072,
++ 0x80f2c072, 0xbf8c007f,
++ 0x80fc907c, 0xbe802d00,
++ 0xbe822d02, 0xbe842d04,
++ 0xbe862d06, 0xbe882d08,
++ 0xbe8a2d0a, 0xbe8c2d0c,
++ 0xbe8e2d0e, 0xbf06807c,
++ 0xbf84fff1, 0xb8f22a05,
++ 0x80728172, 0x8e728a72,
++ 0xb8f61605, 0x80768176,
++ 0x8e768676, 0x80727672,
++ 0xbefa0084, 0xbefa00ff,
++ 0x01000000, 0xc0211cfc,
++ 0x00000072, 0x80728472,
++ 0xc0211c3c, 0x00000072,
++ 0x80728472, 0xc0211c7c,
++ 0x00000072, 0x80728472,
++ 0xc0211bbc, 0x00000072,
++ 0x80728472, 0xc0211bfc,
++ 0x00000072, 0x80728472,
++ 0xc0211d3c, 0x00000072,
++ 0x80728472, 0xc0211d7c,
++ 0x00000072, 0x80728472,
++ 0xc0211a3c, 0x00000072,
++ 0x80728472, 0xc0211a7c,
++ 0x00000072, 0x80728472,
++ 0xc0211dfc, 0x00000072,
++ 0x80728472, 0xc0211b3c,
++ 0x00000072, 0x80728472,
++ 0xc0211b7c, 0x00000072,
++ 0x80728472, 0xbf8c007f,
++ 0x8671ff71, 0x0000ffff,
++ 0xbefc0073, 0xbefe006e,
++ 0xbeff006f, 0x867375ff,
++ 0x000003ff, 0xb9734803,
++ 0x867375ff, 0xfffff800,
++ 0x8f738b73, 0xb973a2c3,
++ 0xb977f801, 0x8673ff71,
++ 0xf0000000, 0x8f739c73,
++ 0x8e739073, 0xbef60080,
++ 0x87767376, 0x8673ff71,
++ 0x08000000, 0x8f739b73,
++ 0x8e738f73, 0x87767376,
++ 0x8673ff74, 0x00800000,
++ 0x8f739773, 0xb976f807,
++ 0x86fe7e7e, 0x86ea6a6a,
++ 0xb974f802, 0xbf8a0000,
++ 0x95807370, 0xbf810000,
++};
++
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+index 0bb9c57..bd2957c 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+@@ -20,12 +20,9 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+-/* To compile this assembly code:
+- * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex
+- */
+-
+-/* HW (GFX9) source code for CWSR trap handler */
+-/* Version 18 + multiple trap handler */
++#if 0
++HW (GFX9) source code for CWSR trap handler
++#Version 18 + multiple trap handler
+
+ // this performance-optimal version was originally from Seven Xu at SRDC
+
+@@ -77,7 +74,7 @@ var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_D
+ /*************************************************************************/
+ /* control on how to run the shader */
+ /*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
++//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
+ var EMU_RUN_HACK = 0
+ var EMU_RUN_HACK_RESTORE_NORMAL = 0
+ var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+@@ -89,9 +86,9 @@ var WG_BASE_ADDR_HI = 0x0
+ var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+ var CTX_SAVE_CONTROL = 0x0
+ var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
+ var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+ var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+ var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
+
+@@ -100,13 +97,8 @@ var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing
+ /**************************************************************************/
+ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+ var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1
+ var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+ var SQ_WAVE_STATUS_HALT_MASK = 0x2000
+-var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0
+-var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1
+-var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3
+-var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29
+
+ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+ var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+@@ -130,14 +122,11 @@ var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+
+ var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+ var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000
+ var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+
+ var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+ var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+
+-var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data
+-var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000
+
+ /* Save */
+ var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+@@ -158,11 +147,11 @@ var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+ var s_save_spi_init_lo = exec_lo
+ var s_save_spi_init_hi = exec_hi
+
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+ var s_save_pc_hi = ttmp1
+ var s_save_exec_lo = ttmp2
+ var s_save_exec_hi = ttmp3
+-var s_save_tmp = ttmp4
++var s_save_status = ttmp4
+ var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+ var s_save_xnack_mask_lo = ttmp6
+ var s_save_xnack_mask_hi = ttmp7
+@@ -170,12 +159,11 @@ var s_save_buf_rsrc0 = ttmp8
+ var s_save_buf_rsrc1 = ttmp9
+ var s_save_buf_rsrc2 = ttmp10
+ var s_save_buf_rsrc3 = ttmp11
+-var s_save_status = ttmp12
++
+ var s_save_mem_offset = ttmp14
+ var s_save_alloc_size = s_save_trapsts //conflict
++var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+ var s_save_m0 = ttmp15
+-var s_save_ttmps_lo = s_save_tmp //no conflict
+-var s_save_ttmps_hi = s_save_trapsts //no conflict
+
+ /* Restore */
+ var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+@@ -198,7 +186,7 @@ var s_restore_spi_init_hi = exec_hi
+
+ var s_restore_mem_offset = ttmp12
+ var s_restore_alloc_size = ttmp3
+-var s_restore_tmp = ttmp2
++var s_restore_tmp = ttmp6
+ var s_restore_mem_offset_save = s_restore_tmp //no conflict
+
+ var s_restore_m0 = s_restore_alloc_size //no conflict
+@@ -217,8 +205,6 @@ var s_restore_buf_rsrc0 = ttmp8
+ var s_restore_buf_rsrc1 = ttmp9
+ var s_restore_buf_rsrc2 = ttmp10
+ var s_restore_buf_rsrc3 = ttmp11
+-var s_restore_ttmps_lo = s_restore_tmp //no conflict
+-var s_restore_ttmps_hi = s_restore_alloc_size //no conflict
+
+ /**************************************************************************/
+ /* trap handler entry points */
+@@ -249,25 +235,25 @@ L_SKIP_RESTORE:
+ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+ s_cbranch_scc1 L_SAVE //this is the operation for save
+
+ // ********* Handle non-CWSR traps *******************
+ if (!EMU_RUN_HACK)
+ // Illegal instruction is a non-maskable exception which blocks context save.
+ // Halt the wavefront and return from the trap.
+- s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+ s_cbranch_scc1 L_HALT_WAVE
+
+ // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
+ // Instead, halt the wavefront and return from the trap.
+- s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_FETCH_2ND_TRAP
++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
++ s_cbranch_scc0 L_NO_MEM_VIOL
+
+ L_HALT_WAVE:
+ // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
+ // We cannot prevent further faults so just terminate the wavefront.
+- s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
++ s_and_b32 ttmp8, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+ s_cbranch_scc0 L_NOT_ALREADY_HALTED
+ s_endpgm
+ L_NOT_ALREADY_HALTED:
+@@ -278,31 +264,19 @@ L_NOT_ALREADY_HALTED:
+ s_sub_u32 ttmp0, ttmp0, 0x8
+ s_subb_u32 ttmp1, ttmp1, 0x0
+
+-L_FETCH_2ND_TRAP:
+- // Preserve and clear scalar XNACK state before issuing scalar reads.
+- // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26].
+- s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS)
+- s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+- s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+- s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK
+- s_or_b32 ttmp11, ttmp11, ttmp3
+-
+- s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+- s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
+-
+- // Read second-level TBA/TMA from first-level TMA and jump if available.
+- // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
+- // ttmp12 holds SQ_WAVE_STATUS
+- s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
+- s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
+- s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
+- s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
+- s_waitcnt lgkmcnt(0)
+- s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
+- s_waitcnt lgkmcnt(0)
+- s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
+- s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set
+- s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler
++ s_branch L_EXCP_CASE
++
++L_NO_MEM_VIOL:
++ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
++ s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO)
++ s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI)
++ s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
++ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0
++ s_waitcnt lgkmcnt(0)
++ s_or_b32 ttmp7, ttmp8, ttmp9
++ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
++ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+
+ L_NO_NEXT_TRAP:
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+@@ -312,18 +286,8 @@ L_NO_NEXT_TRAP:
+ s_addc_u32 ttmp1, ttmp1, 0
+ L_EXCP_CASE:
+ s_and_b32 ttmp1, ttmp1, 0xFFFF
+-
+- // Restore SQ_WAVE_IB_STS.
+- s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+- s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+- s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2
+-
+- // Restore SQ_WAVE_STATUS.
+- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+- s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- set_status_without_spi_prio(s_save_status, ttmp2)
+-
+- s_rfe_b64 [ttmp0, ttmp1]
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
++ s_rfe_b64 [ttmp0, ttmp1]
+ end
+ // ********* End handling of non-CWSR traps *******************
+
+@@ -343,6 +307,8 @@ end
+ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+
++ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
++ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation
+ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+@@ -370,10 +336,6 @@ end
+ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+ end
+
+- // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
+- s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp
+-
+ L_SLEEP:
+ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+
+@@ -388,6 +350,7 @@ if G8SR_DEBUG_TIMESTAMP
+ s_waitcnt lgkmcnt(0)
+ end
+
++ /* setup Resource Contants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_save_tmp, v9, 0
+@@ -405,24 +368,7 @@ end
+ else
+ end
+
+- // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+- // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
+- get_vgpr_size_bytes(s_save_ttmps_lo)
+- get_sgpr_size_bytes(s_save_ttmps_hi)
+- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi
+- s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
+- s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0
+- s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
+- s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
+- ack_sqc_store_workaround()
+- s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
+- ack_sqc_store_workaround()
+- s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
+- ack_sqc_store_workaround()
+- s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
+- ack_sqc_store_workaround()
+
+- /* setup Resource Contants */
+ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+@@ -479,8 +425,8 @@ end
+ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+
+- write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+- write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
++ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
++ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+
+ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+@@ -556,8 +502,6 @@ end
+ s_mov_b32 s_save_mem_offset, 0
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+- s_mov_b32 xnack_mask_lo, 0x0
+- s_mov_b32 xnack_mask_hi, 0x0
+
+ if (SWIZZLE_EN)
+ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+@@ -1071,6 +1015,8 @@ end
+
+ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+
++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
++
+ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+@@ -1092,21 +1038,6 @@ end
+ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+-
+- // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+- // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
+- get_vgpr_size_bytes(s_restore_ttmps_lo)
+- get_sgpr_size_bytes(s_restore_ttmps_hi)
+- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi
+- s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
+- s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
+- s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
+- s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
+- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
+- s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
+- s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
+- s_waitcnt lgkmcnt(0)
+-
+ //reuse s_restore_m0 as a temp register
+ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+@@ -1121,12 +1052,11 @@ end
+ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+ s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu
++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+
+- s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
++ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
+
+ if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_d
+@@ -1155,7 +1085,9 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+ s_mov_b32 m0, s_mem_offset
+ s_buffer_store_dword s, s_rsrc, m0 glc:1
+- ack_sqc_store_workaround()
++if ACK_SQC_STORE
++ s_waitcnt lgkmcnt(0)
++end
+ s_add_u32 s_mem_offset, s_mem_offset, 4
+ s_mov_b32 m0, exec_lo
+ end
+@@ -1165,13 +1097,21 @@ end
+ function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+
+ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
+- ack_sqc_store_workaround()
++if ACK_SQC_STORE
++ s_waitcnt lgkmcnt(0)
++end
+ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
+- ack_sqc_store_workaround()
++if ACK_SQC_STORE
++ s_waitcnt lgkmcnt(0)
++end
+ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
+- ack_sqc_store_workaround()
++if ACK_SQC_STORE
++ s_waitcnt lgkmcnt(0)
++end
+ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+- ack_sqc_store_workaround()
++if ACK_SQC_STORE
++ s_waitcnt lgkmcnt(0)
++end
+ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
+ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
+ end
+@@ -1211,16 +1151,261 @@ function get_hwreg_size_bytes
+ return 128 //HWREG size 128 bytes
+ end
+
+-function ack_sqc_store_workaround
+- if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+- end
+-end
+
+-function set_status_without_spi_prio(status, tmp)
+- // Do not restore STATUS.SPI_PRIO since scheduler may have raised it.
+- s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT
+- s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp
+- s_nop 0x2 // avoid S_SETREG => S_SETREG hazard
+- s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status
+-end
++
++#endif
++
++static const uint32_t cwsr_trap_gfx9_hex[] = {
++ 0xbf820001, 0xbf820130,
++ 0xb8f0f802, 0x89708670,
++ 0xb8f1f803, 0x8674ff71,
++ 0x00000400, 0xbf850023,
++ 0x8674ff71, 0x00000800,
++ 0xbf850003, 0x8674ff71,
++ 0x00000100, 0xbf840009,
++ 0x8674ff70, 0x00002000,
++ 0xbf840001, 0xbf810000,
++ 0x8770ff70, 0x00002000,
++ 0x80ec886c, 0x82ed806d,
++ 0xbf820010, 0xb8faf812,
++ 0xb8fbf813, 0x8efa887a,
++ 0xc00a1d3d, 0x00000000,
++ 0xbf8cc07f, 0x87737574,
++ 0xbf840002, 0xb970f802,
++ 0xbe801d74, 0xb8f1f803,
++ 0x8671ff71, 0x000001ff,
++ 0xbf850002, 0x806c846c,
++ 0x826d806d, 0x866dff6d,
++ 0x0000ffff, 0xb970f802,
++ 0xbe801f6c, 0x866dff6d,
++ 0x0000ffff, 0xbef60080,
++ 0xb9760283, 0xbef20068,
++ 0xbef30069, 0xb8f62407,
++ 0x8e769c76, 0x876d766d,
++ 0xb8f603c7, 0x8e769b76,
++ 0x876d766d, 0xb8f6f807,
++ 0x8676ff76, 0x00007fff,
++ 0xb976f807, 0xbeee007e,
++ 0xbeef007f, 0xbefe0180,
++ 0xbf900004, 0xbf8e0002,
++ 0xbf88fffe, 0xbef4007e,
++ 0x8675ff7f, 0x0000ffff,
++ 0x8775ff75, 0x00040000,
++ 0xbef60080, 0xbef700ff,
++ 0x00807fac, 0x8676ff7f,
++ 0x08000000, 0x8f768376,
++ 0x87777677, 0x8676ff7f,
++ 0x70000000, 0x8f768176,
++ 0x87777677, 0xbefb007c,
++ 0xbefa0080, 0xb8fa2a05,
++ 0x807a817a, 0x8e7a8a7a,
++ 0xb8f61605, 0x80768176,
++ 0x8e768676, 0x807a767a,
++ 0xbef60084, 0xbef600ff,
++ 0x01000000, 0xbefe007c,
++ 0xbefc007a, 0xc0611efa,
++ 0x0000007c, 0xbf8cc07f,
++ 0x807a847a, 0xbefc007e,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611b3a, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc007a, 0xc0611b7a,
++ 0x0000007c, 0xbf8cc07f,
++ 0x807a847a, 0xbefc007e,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611bba, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc007a, 0xc0611bfa,
++ 0x0000007c, 0xbf8cc07f,
++ 0x807a847a, 0xbefc007e,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611c3a, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0xb8f1f803,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611c7a, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0xbefe007c,
++ 0xbefc007a, 0xc0611cba,
++ 0x0000007c, 0xbf8cc07f,
++ 0x807a847a, 0xbefc007e,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611cfa, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0xb8fbf801,
++ 0xbefe007c, 0xbefc007a,
++ 0xc0611efa, 0x0000007c,
++ 0xbf8cc07f, 0x807a847a,
++ 0xbefc007e, 0x8676ff7f,
++ 0x04000000, 0xbeef0080,
++ 0x876f6f76, 0xb8fa2a05,
++ 0x807a817a, 0x8e7a8a7a,
++ 0xb8f11605, 0x80718171,
++ 0x8e718471, 0x8e768271,
++ 0xbef600ff, 0x01000000,
++ 0xbef20174, 0x80747a74,
++ 0x82758075, 0xbefc0080,
++ 0xbf800000, 0xbe802b00,
++ 0xbe822b02, 0xbe842b04,
++ 0xbe862b06, 0xbe882b08,
++ 0xbe8a2b0a, 0xbe8c2b0c,
++ 0xbe8e2b0e, 0xc06b003a,
++ 0x00000000, 0xbf8cc07f,
++ 0xc06b013a, 0x00000010,
++ 0xbf8cc07f, 0xc06b023a,
++ 0x00000020, 0xbf8cc07f,
++ 0xc06b033a, 0x00000030,
++ 0xbf8cc07f, 0x8074c074,
++ 0x82758075, 0x807c907c,
++ 0xbf0a717c, 0xbf85ffe7,
++ 0xbef40172, 0xbefa0080,
++ 0xbefe00c1, 0xbeff00c1,
++ 0xbef600ff, 0x01000000,
++ 0xe0724000, 0x7a1d0000,
++ 0xe0724100, 0x7a1d0100,
++ 0xe0724200, 0x7a1d0200,
++ 0xe0724300, 0x7a1d0300,
++ 0xbefe00c1, 0xbeff00c1,
++ 0xb8f14306, 0x8671c171,
++ 0xbf84002c, 0xbf8a0000,
++ 0x8676ff6f, 0x04000000,
++ 0xbf840028, 0x8e718671,
++ 0x8e718271, 0xbef60071,
++ 0xb8fa2a05, 0x807a817a,
++ 0x8e7a8a7a, 0xb8f61605,
++ 0x80768176, 0x8e768676,
++ 0x807a767a, 0x807aff7a,
++ 0x00000080, 0xbef600ff,
++ 0x01000000, 0xbefc0080,
++ 0xd28c0002, 0x000100c1,
++ 0xd28d0003, 0x000204c1,
++ 0xd1060002, 0x00011103,
++ 0x7e0602ff, 0x00000200,
++ 0xbefc00ff, 0x00010000,
++ 0xbe800077, 0x8677ff77,
++ 0xff7fffff, 0x8777ff77,
++ 0x00058000, 0xd8ec0000,
++ 0x00000002, 0xbf8cc07f,
++ 0xe0765000, 0x7a1d0002,
++ 0x68040702, 0xd0c9006a,
++ 0x0000e302, 0xbf87fff7,
++ 0xbef70000, 0xbefa00ff,
++ 0x00000400, 0xbefe00c1,
++ 0xbeff00c1, 0xb8f12a05,
++ 0x80718171, 0x8e718271,
++ 0x8e768871, 0xbef600ff,
++ 0x01000000, 0xbefc0084,
++ 0xbf0a717c, 0xbf840015,
++ 0xbf11017c, 0x8071ff71,
++ 0x00001000, 0x7e000300,
++ 0x7e020301, 0x7e040302,
++ 0x7e060303, 0xe0724000,
++ 0x7a1d0000, 0xe0724100,
++ 0x7a1d0100, 0xe0724200,
++ 0x7a1d0200, 0xe0724300,
++ 0x7a1d0300, 0x807c847c,
++ 0x807aff7a, 0x00000400,
++ 0xbf0a717c, 0xbf85ffef,
++ 0xbf9c0000, 0xbf8200c5,
++ 0xbef4007e, 0x8675ff7f,
++ 0x0000ffff, 0x8775ff75,
++ 0x00040000, 0xbef60080,
++ 0xbef700ff, 0x00807fac,
++ 0x8672ff7f, 0x08000000,
++ 0x8f728372, 0x87777277,
++ 0x8672ff7f, 0x70000000,
++ 0x8f728172, 0x87777277,
++ 0x8672ff7f, 0x04000000,
++ 0xbf84001e, 0xbefe00c1,
++ 0xbeff00c1, 0xb8ef4306,
++ 0x866fc16f, 0xbf840019,
++ 0x8e6f866f, 0x8e6f826f,
++ 0xbef6006f, 0xb8f82a05,
++ 0x80788178, 0x8e788a78,
++ 0xb8f21605, 0x80728172,
++ 0x8e728672, 0x80787278,
++ 0x8078ff78, 0x00000080,
++ 0xbef600ff, 0x01000000,
++ 0xbefc0080, 0xe0510000,
++ 0x781d0000, 0xe0510100,
++ 0x781d0000, 0x807cff7c,
++ 0x00000200, 0x8078ff78,
++ 0x00000200, 0xbf0a6f7c,
++ 0xbf85fff6, 0xbef80080,
++ 0xbefe00c1, 0xbeff00c1,
++ 0xb8ef2a05, 0x806f816f,
++ 0x8e6f826f, 0x8e76886f,
++ 0xbef600ff, 0x01000000,
++ 0xbef20078, 0x8078ff78,
++ 0x00000400, 0xbefc0084,
++ 0xbf11087c, 0x806fff6f,
++ 0x00008000, 0xe0524000,
++ 0x781d0000, 0xe0524100,
++ 0x781d0100, 0xe0524200,
++ 0x781d0200, 0xe0524300,
++ 0x781d0300, 0xbf8c0f70,
++ 0x7e000300, 0x7e020301,
++ 0x7e040302, 0x7e060303,
++ 0x807c847c, 0x8078ff78,
++ 0x00000400, 0xbf0a6f7c,
++ 0xbf85ffee, 0xbf9c0000,
++ 0xe0524000, 0x721d0000,
++ 0xe0524100, 0x721d0100,
++ 0xe0524200, 0x721d0200,
++ 0xe0524300, 0x721d0300,
++ 0xb8f82a05, 0x80788178,
++ 0x8e788a78, 0xb8f21605,
++ 0x80728172, 0x8e728672,
++ 0x80787278, 0x80f8c078,
++ 0xb8ef1605, 0x806f816f,
++ 0x8e6f846f, 0x8e76826f,
++ 0xbef600ff, 0x01000000,
++ 0xbefc006f, 0xc031003a,
++ 0x00000078, 0x80f8c078,
++ 0xbf8cc07f, 0x80fc907c,
++ 0xbf800000, 0xbe802d00,
++ 0xbe822d02, 0xbe842d04,
++ 0xbe862d06, 0xbe882d08,
++ 0xbe8a2d0a, 0xbe8c2d0c,
++ 0xbe8e2d0e, 0xbf06807c,
++ 0xbf84fff0, 0xb8f82a05,
++ 0x80788178, 0x8e788a78,
++ 0xb8f21605, 0x80728172,
++ 0x8e728672, 0x80787278,
++ 0xbef60084, 0xbef600ff,
++ 0x01000000, 0xc0211bfa,
++ 0x00000078, 0x80788478,
++ 0xc0211b3a, 0x00000078,
++ 0x80788478, 0xc0211b7a,
++ 0x00000078, 0x80788478,
++ 0xc0211eba, 0x00000078,
++ 0x80788478, 0xc0211efa,
++ 0x00000078, 0x80788478,
++ 0xc0211c3a, 0x00000078,
++ 0x80788478, 0xc0211c7a,
++ 0x00000078, 0x80788478,
++ 0xc0211a3a, 0x00000078,
++ 0x80788478, 0xc0211a7a,
++ 0x00000078, 0x80788478,
++ 0xc0211cfa, 0x00000078,
++ 0x80788478, 0xbf8cc07f,
++ 0x866dff6d, 0x0000ffff,
++ 0xbefc006f, 0xbefe007a,
++ 0xbeff007b, 0x866f71ff,
++ 0x000003ff, 0xb96f4803,
++ 0x866f71ff, 0xfffff800,
++ 0x8f6f8b6f, 0xb96fa2c3,
++ 0xb973f801, 0x866fff6d,
++ 0xf0000000, 0x8f6f9c6f,
++ 0x8e6f906f, 0xbef20080,
++ 0x87726f72, 0x866fff6d,
++ 0x08000000, 0x8f6f9b6f,
++ 0x8e6f8f6f, 0x87726f72,
++ 0x866fff70, 0x00800000,
++ 0x8f6f976f, 0xb972f807,
++ 0x86fe7e7e, 0x86ea6a6a,
++ 0xb970f802, 0xbf8a0000,
++ 0x95806f6c, 0xbf810000,
++};
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 56c1230..01c8b19 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -24,7 +24,6 @@
+ #include <linux/export.h>
+ #include <linux/err.h>
+ #include <linux/fs.h>
+-#include <linux/file.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+ #include <linux/slab.h>
+@@ -36,7 +35,6 @@
+ #include <linux/mman.h>
+ #include <asm/processor.h>
+ #include <linux/ptrace.h>
+-#include <linux/pagemap.h>
+
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+@@ -46,6 +44,7 @@
+ static long kfd_ioctl(struct file *, unsigned int, unsigned long);
+ static int kfd_open(struct inode *, struct file *);
+ static int kfd_mmap(struct file *, struct vm_area_struct *);
++static bool kfd_dev_is_large_bar(struct kfd_dev *dev);
+
+ static const char kfd_dev_name[] = "kfd";
+
+@@ -137,9 +136,6 @@ static int kfd_open(struct inode *inode, struct file *filep)
+ if (IS_ERR(process))
+ return PTR_ERR(process);
+
+- if (kfd_is_locked())
+- return -EAGAIN;
+-
+ dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
+ process->pasid, process->is_32bit_user_mode);
+
+@@ -251,7 +247,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
+ pr_debug("Queue Size: 0x%llX, %u\n",
+ q_properties->queue_size, args->ring_size);
+
+- pr_debug("Queue r/w Pointers: %px, %px\n",
++ pr_debug("Queue r/w Pointers: %p, %p\n",
+ q_properties->read_ptr,
+ q_properties->write_ptr);
+
+@@ -903,7 +899,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+ mutex_lock(&p->mutex);
+
+ if (!kfd_has_process_device_data(p))
+- goto out_unlock;
++ goto out_upwrite;
+
+ /* Run over all pdd of the process */
+ pdd = kfd_get_first_process_device_data(p);
+@@ -912,7 +908,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+ pdd = kfd_get_next_process_device_data(p, pdd);
+ } while (pdd);
+
+- goto out_unlock;
++ goto out_upwrite;
+ }
+
+ /* Fill in process-aperture information for all available
+@@ -929,7 +925,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+ if (!kfd_has_process_device_data(p)) {
+ args->num_of_nodes = 0;
+ kfree(pa);
+- goto out_unlock;
++ goto out_upwrite;
+ }
+
+ /* Run over all pdd of the process */
+@@ -971,7 +967,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+ kfree(pa);
+ return ret ? -EFAULT : 0;
+
+-out_unlock:
++out_upwrite:
+ mutex_unlock(&p->mutex);
+ return 0;
+ }
+@@ -980,70 +976,55 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
+ void *data)
+ {
+ struct kfd_ioctl_create_event_args *args = data;
+- int err;
+-
+- /* For dGPUs the event page is allocated in user mode. The
+- * handle is passed to KFD with the first call to this IOCTL
+- * through the event_page_offset field.
+- */
+- if (args->event_page_offset) {
+- struct kfd_dev *kfd;
+- struct kfd_process_device *pdd;
+- void *mem, *kern_addr;
+- uint64_t size;
++ struct kfd_dev *kfd;
++ struct kfd_process_device *pdd;
++ int err = -EINVAL;
++ void *mem, *kern_addr = NULL;
+
+- if (p->signal_page) {
+- pr_err("Event page is already set\n");
+- return -EINVAL;
+- }
++ pr_debug("Event page offset 0x%llx\n", args->event_page_offset);
+
++ if (args->event_page_offset) {
+ kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
+ if (!kfd) {
+ pr_err("Getting device by id failed in %s\n", __func__);
+- return -EINVAL;
++ return -EFAULT;
+ }
+-
+- mutex_lock(&p->mutex);
+- pdd = kfd_bind_process_to_device(kfd, p);
+- if (IS_ERR(pdd)) {
+- err = PTR_ERR(pdd);
+- goto out_unlock;
+- }
+-
+- mem = kfd_process_device_translate_handle(pdd,
++ if (!kfd->device_info->needs_iommu_device) {
++ mutex_lock(&p->mutex);
++ pdd = kfd_bind_process_to_device(kfd, p);
++ if (IS_ERR(pdd)) {
++ err = PTR_ERR(pdd);
++ goto out_upwrite;
++ }
++ mem = kfd_process_device_translate_handle(pdd,
+ GET_IDR_HANDLE(args->event_page_offset));
+- if (!mem) {
+- pr_err("Can't find BO, offset is 0x%llx\n",
+- args->event_page_offset);
+- err = -EINVAL;
+- goto out_unlock;
+- }
+- mutex_unlock(&p->mutex);
+-
+- err = kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd,
+- mem, &kern_addr, &size);
+- if (err) {
+- pr_err("Failed to map event page to kernel\n");
+- return err;
+- }
++ if (!mem) {
++ pr_err("Can't find BO, offset is 0x%llx\n",
++ args->event_page_offset);
++ err = -EFAULT;
++ goto out_upwrite;
++ }
++ mutex_unlock(&p->mutex);
+
+- err = kfd_event_page_set(p, kern_addr, size);
+- if (err) {
+- pr_err("Failed to set event page\n");
+- return err;
++ /* Map dGPU gtt BO to kernel */
++ kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd,
++ mem, &kern_addr, NULL);
+ }
+ }
+
+-
+- err = kfd_event_create(filp, p, args->event_type,
+- args->auto_reset != 0, args->node_id,
+- &args->event_id, &args->event_trigger_data,
+- &args->event_page_offset,
+- &args->event_slot_index);
++ err = kfd_event_create(filp, p,
++ args->event_type,
++ args->auto_reset != 0,
++ args->node_id,
++ &args->event_id,
++ &args->event_trigger_data,
++ &args->event_page_offset,
++ &args->event_slot_index,
++ kern_addr);
+
+ return err;
+
+-out_unlock:
++out_upwrite:
+ mutex_unlock(&p->mutex);
+ return err;
+ }
+@@ -1085,14 +1066,17 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p,
+
+ return err;
+ }
+-static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
++static int kfd_ioctl_alloc_scratch_memory(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+- struct kfd_ioctl_set_scratch_backing_va_args *args = data;
++ struct kfd_ioctl_alloc_memory_of_scratch_args *args = data;
+ struct kfd_process_device *pdd;
+ struct kfd_dev *dev;
+ long err;
+
++ if (args->size == 0)
++ return -EINVAL;
++
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev)
+ return -EINVAL;
+@@ -1242,8 +1226,6 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+ uint64_t offset = args->mmap_offset;
+ uint32_t flags = args->flags;
+ struct vm_area_struct *vma;
+- uint64_t cpuva = 0;
+- unsigned int mem_type = 0;
+
+ if (args->size == 0)
+ return -EINVAL;
+@@ -1273,13 +1255,6 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+ flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL;
+ flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR;
+ offset = (pfn << PAGE_SHIFT);
+- } else {
+- if (offset & (PAGE_SIZE - 1)) {
+- pr_debug("Unaligned userptr address:%llx\n",
+- offset);
+- return -EINVAL;
+- }
+- cpuva = offset;
+ }
+ } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+ if (args->size != kfd_doorbell_process_slice(dev))
+@@ -1297,18 +1272,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+
+ err = dev->kfd2kgd->alloc_memory_of_gpu(
+ dev->kgd, args->va_addr, args->size,
+- pdd->vm, NULL, (struct kgd_mem **) &mem, &offset,
++ pdd->vm, (struct kgd_mem **) &mem, &offset,
+ flags);
+
+ if (err)
+ goto err_unlock;
+
+- mem_type = flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
+- KFD_IOC_ALLOC_MEM_FLAGS_GTT |
+- KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
+- KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL);
+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+- args->va_addr, args->size, cpuva, mem_type, NULL);
++ args->va_addr, args->size, NULL);
+ if (idr_handle < 0) {
+ err = -EFAULT;
+ goto err_free;
+@@ -1322,7 +1293,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+ return 0;
+
+ err_free:
+- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem);
++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
++ (struct kgd_mem *) mem);
+ err_unlock:
+ mutex_unlock(&p->mutex);
+ return err;
+@@ -1363,7 +1335,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
+ /* If freeing the buffer failed, leave the handle in place for
+ * clean-up during process tear-down.
+ */
+- if (!ret)
++ if (ret == 0)
+ kfd_process_device_remove_obj_handle(
+ pdd, GET_IDR_HANDLE(args->handle));
+
+@@ -1380,30 +1352,31 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+ void *mem;
+ struct kfd_dev *dev, *peer;
+ long err = 0;
+- int i;
++ int i, num_dev = 0;
+ uint32_t *devices_arr = NULL;
+
+ dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+ if (!dev)
+ return -EINVAL;
+
+- if (!args->n_devices) {
+- pr_debug("Device IDs array empty\n");
++ if (args->device_ids_array_size == 0) {
++ pr_debug("Device ID array size is 0\n");
+ return -EINVAL;
+ }
+- if (args->n_success > args->n_devices) {
+- pr_debug("n_success exceeds n_devices\n");
++
++ if (args->device_ids_array_size % sizeof(uint32_t)) {
++ pr_debug("Node IDs array size %u\n",
++ args->device_ids_array_size);
+ return -EINVAL;
+ }
+
+- devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
+- GFP_KERNEL);
++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+ if (!devices_arr)
+ return -ENOMEM;
+
+ err = copy_from_user(devices_arr,
+- (void __user *)args->device_ids_array_ptr,
+- args->n_devices * sizeof(*devices_arr));
++ (void __user *)args->device_ids_array_ptr,
++ args->device_ids_array_size);
+ if (err != 0) {
+ err = -EFAULT;
+ goto copy_from_user_failed;
+@@ -1424,11 +1397,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+ goto get_mem_obj_from_handle_failed;
+ }
+
+- for (i = args->n_success; i < args->n_devices; i++) {
++ num_dev = args->device_ids_array_size / sizeof(uint32_t);
++ for (i = 0 ; i < num_dev; i++) {
+ peer = kfd_device_by_id(devices_arr[i]);
+ if (!peer) {
+ pr_debug("Getting device by id failed for 0x%x\n",
+- devices_arr[i]);
++ devices_arr[i]);
+ err = -EINVAL;
+ goto get_mem_obj_from_handle_failed;
+ }
+@@ -1439,13 +1413,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+ goto get_mem_obj_from_handle_failed;
+ }
+ err = peer->kfd2kgd->map_memory_to_gpu(
+- peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
+- if (err) {
+- pr_err("Failed to map to gpu %d/%d\n",
+- i, args->n_devices);
++ peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
++ if (err != 0) {
++ pr_err("Failed to map to gpu %d, num_dev=%d\n",
++ i, num_dev);
+ goto map_memory_to_gpu_failed;
+ }
+- args->n_success = i+1;
+ }
+
+ mutex_unlock(&p->mutex);
+@@ -1457,7 +1430,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+ }
+
+ /* Flush TLBs after waiting for the page table updates to complete */
+- for (i = 0; i < args->n_devices; i++) {
++ for (i = 0; i < num_dev; i++) {
+ peer = kfd_device_by_id(devices_arr[i]);
+ if (WARN_ON_ONCE(!peer))
+ continue;
+@@ -1490,29 +1463,30 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+ void *mem;
+ struct kfd_dev *dev, *peer;
+ long err = 0;
+- uint32_t *devices_arr = NULL, i;
++ uint32_t *devices_arr = NULL, num_dev, i;
+
+ dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+ if (!dev)
+ return -EINVAL;
+
+- if (!args->n_devices) {
+- pr_debug("Device IDs array empty\n");
++ if (args->device_ids_array_size == 0) {
++ pr_debug("Device ID array size is 0\n");
+ return -EINVAL;
+ }
+- if (args->n_success > args->n_devices) {
+- pr_debug("n_success exceeds n_devices\n");
++
++ if (args->device_ids_array_size % sizeof(uint32_t)) {
++ pr_debug("Node IDs array size %u\n",
++ args->device_ids_array_size);
+ return -EINVAL;
+ }
+
+- devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr),
+- GFP_KERNEL);
++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+ if (!devices_arr)
+ return -ENOMEM;
+
+ err = copy_from_user(devices_arr,
+- (void __user *)args->device_ids_array_ptr,
+- args->n_devices * sizeof(*devices_arr));
++ (void __user *)args->device_ids_array_ptr,
++ args->device_ids_array_size);
+ if (err != 0) {
+ err = -EFAULT;
+ goto copy_from_user_failed;
+@@ -1522,7 +1496,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+
+ pdd = kfd_get_process_device_data(dev, p);
+ if (!pdd) {
+- err = -EINVAL;
++ pr_debug("Process device data doesn't exist\n");
++ err = -ENODEV;
+ goto bind_process_to_device_failed;
+ }
+
+@@ -1533,7 +1508,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+ goto get_mem_obj_from_handle_failed;
+ }
+
+- for (i = args->n_success; i < args->n_devices; i++) {
++ num_dev = args->device_ids_array_size / sizeof(uint32_t);
++ for (i = 0 ; i < num_dev; i++) {
+ peer = kfd_device_by_id(devices_arr[i]);
+ if (!peer) {
+ err = -EINVAL;
+@@ -1549,10 +1525,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+ peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
+ if (err) {
+ pr_err("Failed to unmap from gpu %d/%d\n",
+- i, args->n_devices);
++ i, num_dev);
+ goto unmap_memory_from_gpu_failed;
+ }
+- args->n_success = i+1;
+ }
+ kfree(devices_arr);
+
+@@ -1569,6 +1544,34 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+ return err;
+ }
+
++static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep,
++ struct kfd_process *p, void *data)
++{
++ struct kfd_ioctl_set_process_dgpu_aperture_args *args = data;
++ struct kfd_dev *dev;
++ struct kfd_process_device *pdd;
++ long err;
++
++ dev = kfd_device_by_id(args->gpu_id);
++ if (!dev)
++ return -EINVAL;
++
++ mutex_lock(&p->mutex);
++
++ pdd = kfd_bind_process_to_device(dev, p);
++ if (IS_ERR(pdd)) {
++ err = PTR_ERR(pdd);
++ goto exit;
++ }
++
++ err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base,
++ args->dgpu_limit);
++
++exit:
++ mutex_unlock(&p->mutex);
++ return err;
++}
++
+ static int kfd_ioctl_get_dmabuf_info(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+@@ -1683,636 +1686,22 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
+ return r;
+ }
+
+-/* Maximum number of entries for process pages array which lives on stack */
+-#define MAX_PP_STACK_COUNT 16
+-/* Maximum number of pages kmalloc'd to hold struct page's during copy */
+-#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+-#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
+-
+-static void kfd_put_sg_table(struct sg_table *sg)
+-{
+- unsigned int i;
+- struct scatterlist *s;
+-
+- for_each_sg(sg->sgl, s, sg->nents, i)
+- put_page(sg_page(s));
+-}
+-
+-
+-/* Create a sg table for the given userptr BO by pinning its system pages
+- * @bo: userptr BO
+- * @offset: Offset into BO
+- * @mm/@task: mm_struct & task_struct of the process that holds the BO
+- * @size: in/out: desired size / actual size which could be smaller
+- * @sg_size: out: Size of sg table. This is ALIGN_UP(@size)
+- * @ret_sg: out sg table
+- */
+-static int kfd_create_sg_table_from_userptr_bo(struct kfd_bo *bo,
+- int64_t offset, int cma_write,
+- struct mm_struct *mm,
+- struct task_struct *task,
+- uint64_t *size,
+- uint64_t *sg_size,
+- struct sg_table **ret_sg)
+-{
+- int ret, locked = 1;
+- struct sg_table *sg = NULL;
+- unsigned int i, offset_in_page, flags = 0;
+- unsigned long nents, n;
+- unsigned long pa = (bo->cpuva + offset) & PAGE_MASK;
+- unsigned int cur_page = 0;
+- struct scatterlist *s;
+- uint64_t sz = *size;
+- struct page **process_pages;
+-
+- *sg_size = 0;
+- sg = kmalloc(sizeof(*sg), GFP_KERNEL);
+- if (!sg)
+- return -ENOMEM;
+-
+- offset_in_page = offset & (PAGE_SIZE - 1);
+- nents = (sz + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
+-
+- ret = sg_alloc_table(sg, nents, GFP_KERNEL);
+- if (unlikely(ret)) {
+- ret = -ENOMEM;
+- goto sg_alloc_fail;
+- }
+- process_pages = kmalloc_array(nents, sizeof(struct pages *),
+- GFP_KERNEL);
+- if (!process_pages) {
+- ret = -ENOMEM;
+- goto page_alloc_fail;
+- }
+-
+- if (cma_write)
+- flags = FOLL_WRITE;
+- locked = 1;
+- down_read(&mm->mmap_sem);
+- n = get_user_pages_remote(task, mm, pa, nents, flags, process_pages,
+- NULL, &locked);
+- if (locked)
+- up_read(&mm->mmap_sem);
+- if (n <= 0) {
+- pr_err("CMA: Invalid virtual address 0x%lx\n", pa);
+- ret = -EFAULT;
+- goto get_user_fail;
+- }
+- if (n != nents) {
+- /* Pages pinned < requested. Set the size accordingly */
+- *size = (n * PAGE_SIZE) - offset_in_page;
+- pr_debug("Requested %lx but pinned %lx\n", nents, n);
+- }
+-
+- sz = 0;
+- for_each_sg(sg->sgl, s, n, i) {
+- sg_set_page(s, process_pages[cur_page], PAGE_SIZE,
+- offset_in_page);
+- sg_dma_address(s) = page_to_phys(process_pages[cur_page]);
+- offset_in_page = 0;
+- cur_page++;
+- sz += PAGE_SIZE;
+- }
+- *ret_sg = sg;
+- *sg_size = sz;
+-
+- kfree(process_pages);
+- return 0;
+-
+-get_user_fail:
+- kfree(process_pages);
+-page_alloc_fail:
+- sg_free_table(sg);
+-sg_alloc_fail:
+- kfree(sg);
+- return ret;
+-}
+-
+-static void kfd_free_cma_bos(struct cma_iter *ci)
+-{
+- struct cma_system_bo *cma_bo, *tmp;
+-
+- list_for_each_entry_safe(cma_bo, tmp, &ci->cma_list, list) {
+- struct kfd_dev *dev = cma_bo->dev;
+-
+- /* sg table is deleted by free_memory_of_gpu */
+- if (cma_bo->sg)
+- kfd_put_sg_table(cma_bo->sg);
+- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem);
+- list_del(&cma_bo->list);
+- kfree(cma_bo);
+- }
+-}
+-
+-/* 1 second timeout */
+-#define CMA_WAIT_TIMEOUT msecs_to_jiffies(1000)
+-
+-static int kfd_cma_fence_wait(struct dma_fence *f)
+-{
+- int ret;
+-
+- ret = dma_fence_wait_timeout(f, false, CMA_WAIT_TIMEOUT);
+- if (likely(ret > 0))
+- return 0;
+- if (!ret)
+- ret = -ETIME;
+- return ret;
+-}
+-
+-/* Put previous (old) fence @pf but it waits for @pf to signal if the context
+- * of the current fence @cf is different.
+- */
+-static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf,
+- struct dma_fence *pf)
+-{
+- int ret = 0;
+-
+- if (pf && cf && cf->context != pf->context)
+- ret = kfd_cma_fence_wait(pf);
+- dma_fence_put(pf);
+- return ret;
+-}
+-
+-#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE)
+-
+-/* Create an equivalent system BO for the given @bo. If @bo is a userptr then
+- * create a new system BO by pinning underlying system pages of the given
+- * userptr BO. If @bo is in Local Memory then create an empty system BO and
+- * then copy @bo into this new BO.
+- * @bo: Userptr BO or Local Memory BO
+- * @offset: Offset into bo
+- * @size: in/out: The size of the new BO could be less than requested if all
+- * the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would
+- * be reflected in @size
+- * @mm/@task: mm/task to which @bo belongs to
+- * @cma_bo: out: new system BO
+- */
+-static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo,
+- uint64_t *size, uint64_t offset,
+- int cma_write, struct kfd_process *p,
+- struct mm_struct *mm,
+- struct task_struct *task,
+- struct cma_system_bo **cma_bo)
+-{
+- int ret;
+- struct kfd_process_device *pdd = NULL;
+- struct cma_system_bo *cbo;
+- uint64_t bo_size = 0;
+- struct dma_fence *f;
+-
+- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_WRITABLE |
+- ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
+-
+- *cma_bo = NULL;
+- cbo = kzalloc(sizeof(**cma_bo), GFP_KERNEL);
+- if (!cbo)
+- return -ENOMEM;
+-
+- INIT_LIST_HEAD(&cbo->list);
+- if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+- bo_size = min_t(uint64_t, *size, MAX_SYSTEM_BO_SIZE);
+- else if (bo->cpuva) {
+- ret = kfd_create_sg_table_from_userptr_bo(bo, offset,
+- cma_write, mm, task,
+- size, &bo_size,
+- &cbo->sg);
+- if (ret) {
+- pr_err("CMA: BO create with sg failed %d\n", ret);
+- goto sg_fail;
+- }
+- } else {
+- WARN_ON(1);
+- ret = -EINVAL;
+- goto sg_fail;
+- }
+- mutex_lock(&p->mutex);
+- pdd = kfd_get_process_device_data(kdev, p);
+- if (!pdd) {
+- mutex_unlock(&p->mutex);
+- pr_err("Process device data doesn't exist\n");
+- ret = -EINVAL;
+- goto pdd_fail;
+- }
+-
+- ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size,
+- pdd->vm, cbo->sg,
+- &cbo->mem, NULL, flags);
+- mutex_unlock(&p->mutex);
+- if (ret) {
+- pr_err("Failed to create shadow system BO %d\n", ret);
+- goto pdd_fail;
+- }
+-
+- if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+- ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem,
+- offset, cbo->mem, 0,
+- bo_size, &f, size);
+- if (ret) {
+- pr_err("CMA: Intermediate copy failed %d\n", ret);
+- goto copy_fail;
+- }
+-
+- /* Wait for the copy to finish as subsequent copy will be done
+- * by different device
+- */
+- ret = kfd_cma_fence_wait(f);
+- dma_fence_put(f);
+- if (ret) {
+- pr_err("CMA: Intermediate copy timed out %d\n", ret);
+- goto copy_fail;
+- }
+- }
+-
+- cbo->dev = kdev;
+- *cma_bo = cbo;
+-
+- return ret;
+-
+-copy_fail:
+- kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem);
+-pdd_fail:
+- if (cbo->sg) {
+- kfd_put_sg_table(cbo->sg);
+- sg_free_table(cbo->sg);
+- kfree(cbo->sg);
+- }
+-sg_fail:
+- kfree(cbo);
+- return ret;
+-}
+-
+-/* Update cma_iter.cur_bo with KFD BO that is assocaited with
+- * cma_iter.array.va_addr
+- */
+-static int kfd_cma_iter_update_bo(struct cma_iter *ci)
+-{
+- struct kfd_memory_range *arr = ci->array;
+- uint64_t va_end = arr->va_addr + arr->size - 1;
+-
+- mutex_lock(&ci->p->mutex);
+- ci->cur_bo = kfd_process_find_bo_from_interval(ci->p, arr->va_addr,
+- va_end);
+- mutex_unlock(&ci->p->mutex);
+-
+- if (!ci->cur_bo || va_end > ci->cur_bo->it.last) {
+- pr_err("CMA failed. Range out of bounds\n");
+- return -EFAULT;
+- }
+- return 0;
+-}
+-
+-/* Advance iter by @size bytes. */
+-static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
+-{
+- int ret = 0;
+-
+- ci->offset += size;
+- if (WARN_ON(size > ci->total || ci->offset > ci->array->size))
+- return -EFAULT;
+- ci->total -= size;
+- /* If current range is copied, move to next range if available. */
+- if (ci->offset == ci->array->size) {
+-
+- /* End of all ranges */
+- if (!(--ci->nr_segs))
+- return 0;
+-
+- ci->array++;
+- ci->offset = 0;
+- ret = kfd_cma_iter_update_bo(ci);
+- if (ret)
+- return ret;
+- }
+- ci->bo_offset = (ci->array->va_addr + ci->offset) -
+- ci->cur_bo->it.start;
+- return ret;
+-}
+-
+-static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
+- struct kfd_process *p, struct mm_struct *mm,
+- struct task_struct *task, struct cma_iter *ci)
+-{
+- int ret;
+- int nr;
+-
+- if (!arr || !segs)
+- return -EINVAL;
+-
+- memset(ci, 0, sizeof(*ci));
+- INIT_LIST_HEAD(&ci->cma_list);
+- ci->array = arr;
+- ci->nr_segs = segs;
+- ci->p = p;
+- ci->offset = 0;
+- ci->mm = mm;
+- ci->task = task;
+- for (nr = 0; nr < segs; nr++)
+- ci->total += arr[nr].size;
+-
+- /* Valid but size is 0. So copied will also be 0 */
+- if (!ci->total)
+- return 0;
+-
+- ret = kfd_cma_iter_update_bo(ci);
+- if (!ret)
+- ci->bo_offset = arr->va_addr - ci->cur_bo->it.start;
+- return ret;
+-}
+-
+-static bool kfd_cma_iter_end(struct cma_iter *ci)
+-{
+- if (!(ci->nr_segs) || !(ci->total))
+- return true;
+- return false;
+-}
+-
+-/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes
+- * both source and dest. BOs are userptr BOs. Both BOs can either belong to
+- * current process or one of the BOs can belong to a differnt
+- * process. @Returns 0 on success, -ve on failure
+- *
+- * @si: Source iter
+- * @di: Dest. iter
+- * @cma_write: Indicates if it is write to remote or read from remote
+- * @size: amount of bytes to be copied
+- * @copied: Return number of bytes actually copied.
+- */
+-static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
+- bool cma_write, uint64_t size,
+- uint64_t *copied)
+-{
+- int i, ret = 0, locked;
+- unsigned int nents, nl;
+- unsigned int offset_in_page;
+- struct page *pp_stack[MAX_PP_STACK_COUNT];
+- struct page **process_pages = pp_stack;
+- unsigned long rva, lva = 0, flags = 0;
+- uint64_t copy_size, to_copy = size;
+- struct cma_iter *li, *ri;
+-
+- if (cma_write) {
+- ri = di;
+- li = si;
+- flags |= FOLL_WRITE;
+- } else {
+- li = di;
+- ri = si;
+- }
+- /* rva: remote virtual address. Page aligned to start page.
+- * rva + offset_in_page: Points to remote start address
+- * lva: local virtual address. Points to the start address.
+- * nents: computes number of remote pages to request
+- */
+- offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
+- rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
+- lva = li->cur_bo->cpuva + li->bo_offset;
+-
+- nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
+-
+- copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
+- *copied = 0;
+-
+- if (nents > MAX_PP_STACK_COUNT) {
+- /* For reliability kmalloc only 2 pages worth */
+- process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
+- sizeof(struct pages *)*nents),
+- GFP_KERNEL);
+-
+- if (!process_pages)
+- return -ENOMEM;
+- }
+-
+- while (nents && to_copy) {
+- nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
+- locked = 1;
+- down_read(&ri->mm->mmap_sem);
+- nl = get_user_pages_remote(ri->task, ri->mm, rva, nl,
+- flags, process_pages, NULL,
+- &locked);
+- if (locked)
+- up_read(&ri->mm->mmap_sem);
+- if (nl <= 0) {
+- pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
+- ret = -EFAULT;
+- break;
+- }
+-
+- for (i = 0; i < nl; i++) {
+- unsigned int n;
+- void *kaddr = kmap(process_pages[i]);
+-
+- if (cma_write) {
+- n = copy_from_user(kaddr+offset_in_page,
+- (void *)lva, copy_size);
+- set_page_dirty(process_pages[i]);
+- } else {
+- n = copy_to_user((void *)lva,
+- kaddr+offset_in_page,
+- copy_size);
+- }
+- kunmap(kaddr);
+- if (n) {
+- ret = -EFAULT;
+- break;
+- }
+- to_copy -= copy_size;
+- if (!to_copy)
+- break;
+- lva += copy_size;
+- rva += (copy_size + offset_in_page);
+- WARN_ONCE(rva & (PAGE_SIZE - 1),
+- "CMA: Error in remote VA computation");
+- offset_in_page = 0;
+- copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
+- }
+-
+- for (i = 0; i < nl; i++)
+- put_page(process_pages[i]);
+-
+- if (ret)
+- break;
+- nents -= nl;
+- }
+-
+- if (process_pages != pp_stack)
+- kfree(process_pages);
+-
+- *copied = (size - to_copy);
+- return ret;
+-
+-}
+-
+-/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
+- * respective offset.
+- * @si: Source iter
+- * @di: Dest. iter
+- * @cma_write: Indicates if it is write to remote or read from remote
+- * @size: amount of bytes to be copied
+- * @f: Return the last fence if any
+- * @copied: Return number of bytes actually copied.
+- */
+-static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+- int cma_write, uint64_t size,
+- struct dma_fence **f, uint64_t *copied)
+-{
+- int err = 0;
+- struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
+- uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
+- struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
+- struct kfd_dev *dev = dst_bo->dev;
+- struct cma_system_bo *tmp_bo = NULL;
+-
+- *copied = 0;
+- if (f)
+- *f = NULL;
+- if (src_bo->cpuva && dst_bo->cpuva)
+- return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
+-
+- /* If either source or dest. is userptr, create a shadow system BO
+- * by using the underlying userptr BO pages. Then use this shadow
+- * BO for copy. src_offset & dst_offset are adjusted because the new BO
+- * is only created for the window (offset, size) requested.
+- * The shadow BO is created on the other device. This means if the
+- * other BO is a device memory, the copy will be using that device.
+- * The BOs are stored in cma_list for deferred cleanup. This minimizes
+- * fence waiting just to the last fence.
+- */
+- if (src_bo->cpuva) {
+- dev = dst_bo->dev;
+- err = kfd_create_cma_system_bo(dev, src_bo, &size,
+- si->bo_offset, cma_write,
+- si->p, si->mm, si->task,
+- &si->cma_bo);
+- src_mem = si->cma_bo->mem;
+- src_offset = si->bo_offset & (PAGE_SIZE - 1);
+- list_add_tail(&si->cma_bo->list, &si->cma_list);
+- } else if (dst_bo->cpuva) {
+- dev = src_bo->dev;
+- err = kfd_create_cma_system_bo(dev, dst_bo, &size,
+- di->bo_offset, cma_write,
+- di->p, di->mm, di->task,
+- &di->cma_bo);
+- dst_mem = di->cma_bo->mem;
+- dst_offset = di->bo_offset & (PAGE_SIZE - 1);
+- list_add_tail(&di->cma_bo->list, &di->cma_list);
+- } else if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+- /* This indicates that atleast on of the BO is in local mem.
+- * If both are in local mem of different devices then create an
+- * intermediate System BO and do a double copy
+- * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM].
+- * If only one BO is in VRAM then use that GPU to do the copy
+- */
+- if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM &&
+- dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+- dev = dst_bo->dev;
+- err = kfd_create_cma_system_bo(src_bo->dev, src_bo,
+- &size, si->bo_offset,
+- cma_write, si->p,
+- si->mm, si->task,
+- &tmp_bo);
+- src_mem = tmp_bo->mem;
+- src_offset = 0;
+- } else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+- dev = src_bo->dev;
+- /* else already set to dst_bo->dev */
+- }
+-
+- if (err) {
+- pr_err("Failed to create system BO %d", err);
+- return -EINVAL;
+- }
+-
+- err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset,
+- dst_mem, dst_offset, size, f,
+- copied);
+- /* The tmp_bo allocates additional memory. So it is better to wait and
+- * delete. Also since multiple GPUs are involved the copies are
+- * currently not pipelined.
+- */
+- if (tmp_bo) {
+- if (!err) {
+- kfd_cma_fence_wait(*f);
+- dma_fence_put(*f);
+- *f = NULL;
+- }
+- dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem);
+- kfree(tmp_bo);
+- }
+- return err;
+-}
+-
+-/* Copy single range from source iterator @si to destination iterator @di.
+- * @si will move to next range and @di will move by bytes copied.
+- * @return : 0 for success or -ve for failure
+- * @f: The last fence if any
+- * @copied: out: number of bytes copied
+- */
+-static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
+- bool cma_write, struct dma_fence **f,
+- uint64_t *copied)
+-{
+- int err = 0;
+- uint64_t copy_size, n;
+- uint64_t size = si->array->size;
+- struct kfd_bo *src_bo = si->cur_bo;
+- struct dma_fence *lfence = NULL;
+-
+- if (!src_bo || !di || !copied)
+- return -EINVAL;
+- *copied = 0;
+- if (f)
+- *f = NULL;
+-
+- while (size && !kfd_cma_iter_end(di)) {
+- struct dma_fence *fence = NULL;
+-
+- copy_size = min(size, (di->array->size - di->offset));
+-
+- err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
+- if (err) {
+- pr_err("CMA %d failed\n", err);
+- break;
+- }
+-
+- if (fence) {
+- err = kfd_fence_put_wait_if_diff_context(fence,
+- lfence);
+- lfence = fence;
+- if (err)
+- break;
+- }
+-
+- size -= n;
+- *copied += n;
+- err = kfd_cma_iter_advance(si, n);
+- if (err)
+- break;
+- err = kfd_cma_iter_advance(di, n);
+- if (err)
+- break;
+- }
+-
+- if (f)
+- *f = dma_fence_get(lfence);
+- dma_fence_put(lfence);
+-
+- return err;
+-}
+-
+ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+ struct kfd_process *local_p, void *data)
+ {
+ struct kfd_ioctl_cross_memory_copy_args *args = data;
+ struct kfd_memory_range *src_array, *dst_array;
+- struct kfd_process *remote_p;
++ struct kfd_bo *src_bo, *dst_bo;
++ struct kfd_process *remote_p, *src_p, *dst_p;
+ struct task_struct *remote_task;
+ struct mm_struct *remote_mm;
+ struct pid *remote_pid;
+- struct dma_fence *lfence = NULL;
+- uint64_t copied = 0, total_copied = 0;
+- struct cma_iter di, si;
++ struct dma_fence *fence = NULL, *lfence = NULL;
++ uint64_t dst_va_addr;
++ uint64_t copied, total_copied = 0;
++ uint64_t src_offset, dst_offset, dst_va_addr_end;
+ const char *cma_op;
+- int err = 0;
++ int i, j = 0, err = 0;
+
+ /* Check parameters */
+ if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 ||
+@@ -2372,76 +1761,169 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
+ }
+
+ remote_p = kfd_get_process(remote_task);
+- if (IS_ERR(remote_p)) {
++ if (!remote_p) {
+ pr_err("Cross mem copy failed. Invalid kfd process %d\n",
+ args->pid);
+ err = -EINVAL;
+ goto kfd_process_fail;
+ }
+- /* Initialise cma_iter si & @di with source & destination range. */
++
+ if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) {
++ src_p = local_p;
++ dst_p = remote_p;
+ cma_op = "WRITE";
+ pr_debug("CMA WRITE: local -> remote\n");
+- err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+- remote_p, remote_mm, remote_task, &di);
+- if (err)
+- goto kfd_process_fail;
+- err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+- local_p, current->mm, current, &si);
+- if (err)
+- goto kfd_process_fail;
+ } else {
++ src_p = remote_p;
++ dst_p = local_p;
+ cma_op = "READ";
+ pr_debug("CMA READ: remote -> local\n");
+-
+- err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
+- local_p, current->mm, current, &di);
+- if (err)
+- goto kfd_process_fail;
+- err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
+- remote_p, remote_mm, remote_task, &si);
+- if (err)
+- goto kfd_process_fail;
+ }
+
+- /* Copy one si range at a time into di. After each call to
+- * kfd_copy_single_range() si will move to next range. di will be
+- * incremented by bytes copied
+- */
+- while (!kfd_cma_iter_end(&si) && !kfd_cma_iter_end(&di)) {
+- struct dma_fence *fence = NULL;
+-
+- err = kfd_copy_single_range(&si, &di,
+- KFD_IS_CROSS_MEMORY_WRITE(args->flags),
+- &fence, &copied);
+- total_copied += copied;
+
+- if (err)
++ /* For each source kfd_range:
++ * - Find the BO. Each range has to be within the same BO.
++ * - Copy this range to single or multiple destination BOs.
++ * - dst_va_addr - will point to next va address into which data will
++ * be copied.
++ * - dst_bo & src_bo - the current destination and source BOs
++ * - src_offset & dst_offset - offset into the respective BOs from
++ * data will be sourced or copied
++ */
++ dst_va_addr = dst_array[0].va_addr;
++ dst_va_addr_end = dst_va_addr + dst_array[0].size - 1;
++ mutex_lock(&dst_p->mutex);
++ dst_bo = kfd_process_find_bo_from_interval(dst_p,
++ dst_va_addr,
++ dst_va_addr_end);
++ mutex_unlock(&dst_p->mutex);
++ if (!dst_bo || dst_va_addr_end > dst_bo->it.last) {
++ pr_err("CMA %s failed. Invalid dst range\n", cma_op);
++ err = -EFAULT;
++ goto kfd_process_fail;
++ }
++ dst_offset = dst_va_addr - dst_bo->it.start;
++
++ for (i = 0; i < args->src_mem_array_size; i++) {
++ uint64_t src_va_addr_end = src_array[i].va_addr +
++ src_array[i].size - 1;
++ uint64_t src_size_to_copy = src_array[i].size;
++
++ mutex_lock(&src_p->mutex);
++ src_bo = kfd_process_find_bo_from_interval(src_p,
++ src_array[i].va_addr,
++ src_va_addr_end);
++ mutex_unlock(&src_p->mutex);
++ if (!src_bo || src_va_addr_end > src_bo->it.last) {
++ pr_err("CMA %s failed. Invalid src range\n", cma_op);
++ err = -EFAULT;
+ break;
++ }
++
++ src_offset = src_array[i].va_addr - src_bo->it.start;
+
+- /* Release old fence if a later fence is created. If no
+- * new fence is created, then keep the preivous fence
++ /* Copy src_bo to one or multiple dst_bo(s) based on size and
++ * and current copy location.
+ */
+- if (fence) {
+- err = kfd_fence_put_wait_if_diff_context(fence,
+- lfence);
++ while (j < args->dst_mem_array_size) {
++ uint64_t copy_size;
++ int64_t space_left;
++
++ /* Find the current copy_size. This will be smaller of
++ * the following
++ * - space left in the current dest memory range
++ * - data left to copy from source range
++ */
++ space_left = (dst_array[j].va_addr + dst_array[j].size)
++ - dst_va_addr;
++ copy_size = (src_size_to_copy < space_left) ?
++ src_size_to_copy : space_left;
++
++ /* Check both BOs belong to same device */
++ if (src_bo->dev->kgd != dst_bo->dev->kgd) {
++ pr_err("CMA %s fail. Not same dev\n", cma_op);
++ err = -EINVAL;
++ break;
++ }
++
++ /* Store prev fence. Release it when a later fence is
++ * created
++ */
+ lfence = fence;
+- if (err)
++ fence = NULL;
++
++ err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(
++ src_bo->dev->kgd,
++ src_bo->mem, src_offset,
++ dst_bo->mem, dst_offset,
++ copy_size,
++ &fence, &copied);
++
++ if (err) {
++ pr_err("GPU CMA %s failed\n", cma_op);
++ break;
++ }
++
++ /* Later fence available. Release old fence */
++ if (fence && lfence) {
++ dma_fence_put(lfence);
++ lfence = NULL;
++ }
++
++ total_copied += copied;
++ src_size_to_copy -= copied;
++ space_left -= copied;
++ dst_va_addr += copied;
++ dst_offset += copied;
++ src_offset += copied;
++ if (dst_va_addr > dst_bo->it.last + 1) {
++ pr_err("CMA %s fail. Mem overflow\n", cma_op);
++ err = -EFAULT;
++ break;
++ }
++
++ /* If the cur dest range is full move to next one */
++ if (space_left <= 0) {
++ if (++j >= args->dst_mem_array_size)
++ break;
++
++ dst_va_addr = dst_array[j].va_addr;
++ dst_va_addr_end = dst_va_addr +
++ dst_array[j].size - 1;
++ dst_bo = kfd_process_find_bo_from_interval(
++ dst_p,
++ dst_va_addr,
++ dst_va_addr_end);
++ if (!dst_bo ||
++ dst_va_addr_end > dst_bo->it.last) {
++ pr_err("CMA %s failed. Invalid dst range\n",
++ cma_op);
++ err = -EFAULT;
++ break;
++ }
++ dst_offset = dst_va_addr - dst_bo->it.start;
++ }
++
++ /* If the cur src range is done, move to next one */
++ if (src_size_to_copy <= 0)
+ break;
+ }
++ if (err)
++ break;
+ }
+
+ /* Wait for the last fence irrespective of error condition */
+- if (lfence) {
+- err = kfd_cma_fence_wait(lfence);
+- dma_fence_put(lfence);
+- if (err)
++ if (fence) {
++ if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000))
++ < 0)
+ pr_err("CMA %s failed. BO timed out\n", cma_op);
++ dma_fence_put(fence);
++ } else if (lfence) {
++ pr_debug("GPU copy fail. But wait for prev DMA to finish\n");
++ dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000));
++ dma_fence_put(lfence);
+ }
+
+- kfd_free_cma_bos(&si);
+- kfd_free_cma_bos(&di);
+-
+ kfd_process_fail:
+ mmput(remote_mm);
+ mm_access_fail:
+@@ -2530,21 +2012,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL,
+ kfd_ioctl_dbg_wave_control, 0),
+
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA,
+- kfd_ioctl_set_scratch_backing_va, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
+- kfd_ioctl_get_tile_config, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
+- kfd_ioctl_set_trap_handler, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
+- kfd_ioctl_get_process_apertures_new, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM,
+- kfd_ioctl_acquire_vm, 0),
+-
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU,
+ kfd_ioctl_alloc_memory_of_gpu, 0),
+
+@@ -2557,15 +2024,30 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU,
+ kfd_ioctl_unmap_memory_from_gpu, 0),
+
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH,
++ kfd_ioctl_alloc_scratch_memory, 0),
++
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
+ kfd_ioctl_set_cu_mask, 0),
+
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE,
++ kfd_ioctl_set_process_dgpu_aperture, 0),
++
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
++ kfd_ioctl_set_trap_handler, 0),
++
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
++ kfd_ioctl_get_process_apertures_new, 0),
++
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO,
+ kfd_ioctl_get_dmabuf_info, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
+ kfd_ioctl_import_dmabuf, 0),
+
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
++ kfd_ioctl_get_tile_config, 0),
++
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE,
+ kfd_ioctl_ipc_import_handle, 0),
+
+@@ -2578,6 +2060,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
+ kfd_ioctl_get_queue_wave_state, 0),
+
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM,
++ kfd_ioctl_acquire_vm, 0)
++
+ };
+
+ #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
+@@ -2673,33 +2158,34 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
+ {
+ struct kfd_process *process;
+- struct kfd_dev *dev = NULL;
++ struct kfd_dev *kfd;
+ unsigned long vm_pgoff;
+- unsigned int gpu_id;
++ unsigned long long mmap_type;
+
+ process = kfd_get_process(current);
+ if (IS_ERR(process))
+ return PTR_ERR(process);
+
+ vm_pgoff = vma->vm_pgoff;
+- vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff);
+- gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff);
+- if (gpu_id)
+- dev = kfd_device_by_id(gpu_id);
++ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff);
++ mmap_type = vm_pgoff & KFD_MMAP_TYPE_MASK;
+
+- switch (vm_pgoff & KFD_MMAP_TYPE_MASK) {
++ switch (mmap_type) {
+ case KFD_MMAP_TYPE_DOORBELL:
+- if (!dev)
+- return -ENODEV;
+- return kfd_doorbell_mmap(dev, process, vma);
++ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff));
++ if (!kfd)
++ return -EFAULT;
++ return kfd_doorbell_mmap(kfd, process, vma);
+
+ case KFD_MMAP_TYPE_EVENTS:
+ return kfd_event_mmap(process, vma);
+
+ case KFD_MMAP_TYPE_RESERVED_MEM:
+- if (!dev)
+- return -ENODEV;
+- return kfd_reserved_mem_mmap(dev, process, vma);
++ return kfd_reserved_mem_mmap(process, vma);
++
++ default:
++ pr_err("Unsupported kfd mmap type %llx\n", mmap_type);
++ break;
+ }
+
+ return -EFAULT;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+index c540b65..24d0634 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+@@ -1,27 +1,7 @@
+-/*
+- * Copyright 2015-2017 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include <linux/pci.h>
++#include <linux/kernel.h>
+ #include <linux/acpi.h>
++#include <linux/mm.h>
++#include <linux/pci.h>
+ #include "kfd_crat.h"
+ #include "kfd_priv.h"
+ #include "kfd_topology.h"
+@@ -286,7 +266,6 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
+
+ id = cache->processor_id_low;
+
+- pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
+ list_for_each_entry(dev, device_list, list) {
+ total_num_of_cu = (dev->node_props.array_count *
+ dev->node_props.cu_per_simd_array);
+@@ -436,15 +415,11 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+ ret = kfd_parse_subtype_cache(cache, device_list);
+ break;
+ case CRAT_SUBTYPE_TLB_AFFINITY:
+- /*
+- * For now, nothing to do here
+- */
++ /* For now, nothing to do here */
+ pr_debug("Found TLB entry in CRAT table (not processing)\n");
+ break;
+ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
+- /*
+- * For now, nothing to do here
+- */
++ /* For now, nothing to do here */
+ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
+ break;
+ case CRAT_SUBTYPE_IOLINK_AFFINITY:
+@@ -469,8 +444,9 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+ *
+ * Return - 0 if successful else -ve value
+ */
+-int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
+- uint32_t proximity_domain)
++int kfd_parse_crat_table(void *crat_image,
++ struct list_head *device_list,
++ uint32_t proximity_domain)
+ {
+ struct kfd_topology_device *top_dev = NULL;
+ struct crat_subtype_generic *sub_type_hdr;
+@@ -642,7 +618,6 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+ num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
+ break;
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ pcache_info = vega10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
+ break;
+@@ -718,7 +693,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+ * crat_image will be NULL
+ * @size: [OUT] size of crat_image
+ *
+- * Return 0 if successful else return error code
++ * Return 0 if successful else return -ve value
+ */
+ #ifdef CONFIG_ACPI
+ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+@@ -750,8 +725,10 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+ }
+
+ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
+- if (!pcrat_image)
++ if (!pcrat_image) {
++ pr_err("No memory for allocating CRAT image\n");
+ return -ENOMEM;
++ }
+
+ memcpy(pcrat_image, crat_table, crat_table->length);
+
+@@ -938,7 +915,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
+
+ #ifdef CONFIG_ACPI
+ status = acpi_get_table("DSDT", 0, &acpi_table);
+- if (status != AE_OK)
++ if (status == AE_NOT_FOUND)
+ pr_warn("DSDT table not found for OEM information\n");
+ else {
+ crat_table->oem_revision = acpi_table->revision;
+@@ -1095,8 +1072,8 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size,
+ * [OUT] actual size of data filled in crat_image
+ */
+ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+- size_t *size, struct kfd_dev *kdev,
+- uint32_t proximity_domain)
++ size_t *size, struct kfd_dev *kdev,
++ uint32_t proximity_domain)
+ {
+ struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+ struct crat_subtype_generic *sub_type_hdr;
+@@ -1264,8 +1241,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+ * Return 0 if successful else return -ve value
+ */
+ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+- int flags, struct kfd_dev *kdev,
+- uint32_t proximity_domain)
++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
+ {
+ void *pcrat_image = NULL;
+ int ret = 0;
+@@ -1295,8 +1271,8 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+ if (!pcrat_image)
+ return -ENOMEM;
+ *size = VCRAT_SIZE_FOR_GPU;
+- ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
+- proximity_domain);
++ ret = kfd_create_vcrat_image_gpu(pcrat_image, size,
++ kdev, proximity_domain);
+ break;
+ case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
+ /* TODO: */
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+index cd7ee6d..00de41f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+@@ -24,6 +24,7 @@
+ #define KFD_CRAT_H_INCLUDED
+
+ #include <linux/types.h>
++#include "kfd_priv.h"
+
+ #pragma pack(1)
+
+@@ -227,12 +228,12 @@ struct crat_subtype_ccompute {
+ /*
+ * HSA IO Link Affinity structure and definitions
+ */
+-#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0)
+-#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1)
+-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2)
+-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3)
+-#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4)
+-#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0
++#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0)
++#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1)
++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2)
++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3)
++#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4)
++#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0
+
+ /*
+ * IO interface types
+@@ -240,18 +241,18 @@ struct crat_subtype_ccompute {
+ #define CRAT_IOLINK_TYPE_UNDEFINED 0
+ #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1
+ #define CRAT_IOLINK_TYPE_PCIEXPRESS 2
+-#define CRAT_IOLINK_TYPE_AMBA 3
+-#define CRAT_IOLINK_TYPE_MIPI 4
+-#define CRAT_IOLINK_TYPE_QPI_1_1 5
+-#define CRAT_IOLINK_TYPE_RESERVED1 6
+-#define CRAT_IOLINK_TYPE_RESERVED2 7
+-#define CRAT_IOLINK_TYPE_RAPID_IO 8
+-#define CRAT_IOLINK_TYPE_INFINIBAND 9
+-#define CRAT_IOLINK_TYPE_RESERVED3 10
+-#define CRAT_IOLINK_TYPE_OTHER 11
+-#define CRAT_IOLINK_TYPE_MAX 255
+-
+-#define CRAT_IOLINK_RESERVED_LENGTH 24
++#define CRAT_IOLINK_TYPE_AMBA 3
++#define CRAT_IOLINK_TYPE_MIPI 4
++#define CRAT_IOLINK_TYPE_QPI_1_1 5
++#define CRAT_IOLINK_TYPE_RESERVED1 6
++#define CRAT_IOLINK_TYPE_RESERVED2 7
++#define CRAT_IOLINK_TYPE_RAPID_IO 8
++#define CRAT_IOLINK_TYPE_INFINIBAND 9
++#define CRAT_IOLINK_TYPE_RESERVED3 10
++#define CRAT_IOLINK_TYPE_OTHER 11
++#define CRAT_IOLINK_TYPE_MAX 255
++
++#define CRAT_IOLINK_RESERVED_LENGTH 24
+
+ struct crat_subtype_iolink {
+ uint8_t type;
+@@ -307,16 +308,13 @@ struct cdit_header {
+
+ #pragma pack()
+
+-struct kfd_dev;
+-
+ #ifdef CONFIG_ACPI
+ int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
+ #endif
+ void kfd_destroy_crat_image(void *crat_image);
+-int kfd_parse_crat_table(void *crat_image, struct list_head *device_list,
+- uint32_t proximity_domain);
++int kfd_parse_crat_table(void *crat_image,
++ struct list_head *device_list,
++ uint32_t proximity_domain);
+ int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+- int flags, struct kfd_dev *kdev,
+- uint32_t proximity_domain);
+-
++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain);
+ #endif /* KFD_CRAT_H_INCLUDED */
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+index ab37d36..232e28f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2017 Advanced Micro Devices, Inc.
++ * Copyright 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -21,8 +21,6 @@
+ */
+
+ #include <linux/debugfs.h>
+-#include <linux/uaccess.h>
+-
+ #include "kfd_priv.h"
+
+ static struct dentry *debugfs_root;
+@@ -34,38 +32,6 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file)
+ return single_open(file, show, NULL);
+ }
+
+-static ssize_t kfd_debugfs_hang_hws_write(struct file *file,
+- const char __user *user_buf, size_t size, loff_t *ppos)
+-{
+- struct kfd_dev *dev;
+- char tmp[16];
+- uint32_t gpu_id;
+- int ret = -EINVAL;
+-
+- memset(tmp, 0, 16);
+- if (size >= 16) {
+- pr_err("Invalid input for gpu id.\n");
+- goto out;
+- }
+- if (copy_from_user(tmp, user_buf, size)) {
+- ret = -EFAULT;
+- goto out;
+- }
+- if (kstrtoint(tmp, 10, &gpu_id)) {
+- pr_err("Invalid input for gpu id.\n");
+- goto out;
+- }
+- dev = kfd_device_by_id(gpu_id);
+- if (dev) {
+- kfd_debugfs_hang_hws(dev);
+- ret = size;
+- } else
+- pr_err("Cannot find device %d.\n", gpu_id);
+-
+-out:
+- return ret;
+-}
+-
+ static const struct file_operations kfd_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = kfd_debugfs_open,
+@@ -74,15 +40,6 @@ static const struct file_operations kfd_debugfs_fops = {
+ .release = single_release,
+ };
+
+-static const struct file_operations kfd_debugfs_hang_hws_fops = {
+- .owner = THIS_MODULE,
+- .open = kfd_debugfs_open,
+- .read = seq_read,
+- .write = kfd_debugfs_hang_hws_write,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
+-
+ void kfd_debugfs_init(void)
+ {
+ struct dentry *ent;
+@@ -108,11 +65,6 @@ void kfd_debugfs_init(void)
+ ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
+ kfd_debugfs_rls_by_device,
+ &kfd_debugfs_fops);
+-
+- ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+- NULL,
+- &kfd_debugfs_hang_hws_fops);
+-
+ if (!ent)
+ pr_warn("Failed to create rls in kfd debugfs\n");
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+old mode 100644
+new mode 100755
+index 10095087..a9ad2a8
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+@@ -27,17 +27,12 @@
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+ #include "kfd_pm4_headers_vi.h"
+-#include "cwsr_trap_handler.h"
++#include "cwsr_trap_handler_gfx8.asm"
++#include "cwsr_trap_handler_gfx9.asm"
+ #include "kfd_iommu.h"
+
+ #define MQD_SIZE_ALIGNED 768
+-
+-/*
+- * kfd_locked is used to lock the kfd driver during suspend or reset
+- * once locked, kfd driver will stop any further GPU execution.
+- * create process (open) will return -EAGAIN.
+- */
+-static atomic_t kfd_locked = ATOMIC_INIT(0);
++static atomic_t kfd_device_suspended = ATOMIC_INIT(0);
+
+ #ifdef KFD_SUPPORT_IOMMU_V2
+ static const struct kfd_device_info kaveri_device_info = {
+@@ -54,7 +49,6 @@ static const struct kfd_device_info kaveri_device_info = {
+ .needs_iommu_device = true,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info carrizo_device_info = {
+@@ -71,7 +65,6 @@ static const struct kfd_device_info carrizo_device_info = {
+ .needs_iommu_device = true,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info raven_device_info = {
+@@ -87,7 +80,6 @@ static const struct kfd_device_info raven_device_info = {
+ .needs_iommu_device = true,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 1,
+- .num_sdma_queues_per_engine = 2,
+ };
+ #endif
+
+@@ -105,7 +97,6 @@ static const struct kfd_device_info hawaii_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info tonga_device_info = {
+@@ -121,7 +112,6 @@ static const struct kfd_device_info tonga_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info tonga_vf_device_info = {
+@@ -137,7 +127,6 @@ static const struct kfd_device_info tonga_vf_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info fiji_device_info = {
+@@ -153,7 +142,6 @@ static const struct kfd_device_info fiji_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info fiji_vf_device_info = {
+@@ -169,7 +157,6 @@ static const struct kfd_device_info fiji_vf_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+
+@@ -186,7 +173,6 @@ static const struct kfd_device_info polaris10_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info polaris10_vf_device_info = {
+@@ -202,7 +188,6 @@ static const struct kfd_device_info polaris10_vf_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info polaris11_device_info = {
+@@ -218,7 +203,6 @@ static const struct kfd_device_info polaris11_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info vega10_device_info = {
+@@ -232,9 +216,8 @@ static const struct kfd_device_info vega10_device_info = {
+ .mqd_size_aligned = MQD_SIZE_ALIGNED,
+ .supports_cwsr = true,
+ .needs_iommu_device = false,
+- .needs_pci_atomics = false,
++ .needs_pci_atomics = true,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+ };
+
+ static const struct kfd_device_info vega10_vf_device_info = {
+@@ -250,23 +233,6 @@ static const struct kfd_device_info vega10_vf_device_info = {
+ .needs_iommu_device = false,
+ .needs_pci_atomics = false,
+ .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 2,
+-};
+-
+-static const struct kfd_device_info vega20_device_info = {
+- .asic_family = CHIP_VEGA20,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 8,
+- .ih_ring_entry_size = 8 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_v9,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .supports_cwsr = true,
+- .needs_iommu_device = false,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+- .num_sdma_queues_per_engine = 8,
+ };
+
+ struct kfd_deviceid {
+@@ -317,35 +283,35 @@ static const struct kfd_deviceid supported_devices[] = {
+ { 0x67B9, &hawaii_device_info }, /* Hawaii */
+ { 0x67BA, &hawaii_device_info }, /* Hawaii */
+ { 0x67BE, &hawaii_device_info }, /* Hawaii */
+- { 0x6920, &tonga_device_info }, /* Tonga */
+- { 0x6921, &tonga_device_info }, /* Tonga */
+- { 0x6928, &tonga_device_info }, /* Tonga */
+- { 0x6929, &tonga_device_info }, /* Tonga */
+- { 0x692B, &tonga_device_info }, /* Tonga */
+- { 0x692F, &tonga_vf_device_info }, /* Tonga vf */
+- { 0x6938, &tonga_device_info }, /* Tonga */
+- { 0x6939, &tonga_device_info }, /* Tonga */
+- { 0x7300, &fiji_device_info }, /* Fiji */
+- { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/
+- { 0x67C0, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C1, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C2, &polaris10_device_info }, /* Polaris10 */
++ { 0x6920, &tonga_device_info }, /* Tonga */
++ { 0x6921, &tonga_device_info }, /* Tonga */
++ { 0x6928, &tonga_device_info }, /* Tonga */
++ { 0x6929, &tonga_device_info }, /* Tonga */
++ { 0x692B, &tonga_device_info }, /* Tonga */
++ { 0x692F, &tonga_vf_device_info }, /* Tonga vf */
++ { 0x6938, &tonga_device_info }, /* Tonga */
++ { 0x6939, &tonga_device_info }, /* Tonga */
++ { 0x7300, &fiji_device_info }, /* Fiji */
++ { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/
++ { 0x67C0, &polaris10_device_info }, /* Polaris10 */
++ { 0x67C1, &polaris10_device_info }, /* Polaris10 */
++ { 0x67C2, &polaris10_device_info }, /* Polaris10 */
+ { 0x67C4, &polaris10_device_info }, /* Polaris10 */
+ { 0x67C7, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C8, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C9, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CA, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CC, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CF, &polaris10_device_info }, /* Polaris10 */
+- { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/
++ { 0x67C8, &polaris10_device_info }, /* Polaris10 */
++ { 0x67C9, &polaris10_device_info }, /* Polaris10 */
++ { 0x67CA, &polaris10_device_info }, /* Polaris10 */
++ { 0x67CC, &polaris10_device_info }, /* Polaris10 */
++ { 0x67CF, &polaris10_device_info }, /* Polaris10 */
++ { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/
+ { 0x67DF, &polaris10_device_info }, /* Polaris10 */
+- { 0x67E0, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E1, &polaris11_device_info }, /* Polaris11 */
++ { 0x67E0, &polaris11_device_info }, /* Polaris11 */
++ { 0x67E1, &polaris11_device_info }, /* Polaris11 */
+ { 0x67E3, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E7, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E8, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E9, &polaris11_device_info }, /* Polaris11 */
+- { 0x67EB, &polaris11_device_info }, /* Polaris11 */
++ { 0x67E7, &polaris11_device_info }, /* Polaris11 */
++ { 0x67E8, &polaris11_device_info }, /* Polaris11 */
++ { 0x67E9, &polaris11_device_info }, /* Polaris11 */
++ { 0x67EB, &polaris11_device_info }, /* Polaris11 */
+ { 0x67EF, &polaris11_device_info }, /* Polaris11 */
+ { 0x67FF, &polaris11_device_info }, /* Polaris11 */
+ { 0x6860, &vega10_device_info }, /* Vega10 */
+@@ -357,12 +323,6 @@ static const struct kfd_deviceid supported_devices[] = {
+ { 0x6868, &vega10_device_info }, /* Vega10 */
+ { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/
+ { 0x687F, &vega10_device_info }, /* Vega10 */
+- { 0x66a0, &vega20_device_info }, /* Vega20 */
+- { 0x66a1, &vega20_device_info }, /* Vega20 */
+- { 0x66a2, &vega20_device_info }, /* Vega20 */
+- { 0x66a3, &vega20_device_info }, /* Vega20 */
+- { 0x66a7, &vega20_device_info }, /* Vega20 */
+- { 0x66af, &vega20_device_info } /* Vega20 */
+ };
+
+ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
+@@ -392,7 +352,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
+ struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
+ {
+ struct kfd_dev *kfd;
+- int ret;
++
+ const struct kfd_device_info *device_info =
+ lookup_device_info(pdev->device);
+
+@@ -400,27 +360,24 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
+ dev_err(kfd_device, "kgd2kfd_probe failed\n");
+ return NULL;
+ }
+-
++
++ if (device_info->needs_pci_atomics) {
++ /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
++ * 32 and 64-bit requests are possible and must be
++ * supported.
++ */
++ if (pci_enable_atomic_ops_to_root(pdev) < 0) {
++ dev_info(kfd_device,
++ "skipped device %x:%x, PCI rejects atomics",
++ pdev->vendor, pdev->device);
++ return NULL;
++ }
++ }
++
+ kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
+ if (!kfd)
+ return NULL;
+
+- /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
+- * 32 and 64-bit requests are possible and must be
+- * supported.
+- */
+- ret = pci_enable_atomic_ops_to_root(pdev,
+- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+- PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+- if (device_info->needs_pci_atomics && ret < 0) {
+- dev_info(kfd_device,
+- "skipped device %x:%x, PCI rejects atomics",
+- pdev->vendor, pdev->device);
+- kfree(kfd);
+- return NULL;
+- } else if (!ret)
+- kfd->pci_atomic_requested = true;
+-
+ kfd->kgd = kgd;
+ kfd->device_info = device_info;
+ kfd->pdev = pdev;
+@@ -462,6 +419,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ KGD_ENGINE_SDMA1);
+ kfd->shared_resources = *gpu_resources;
+
++ /* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */
+ kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
+ kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1;
+ kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
+@@ -498,8 +456,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+
+ if (kfd->kfd2kgd->init_gtt_mem_allocation(
+ kfd->kgd, size, &kfd->gtt_mem,
+- &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr,
+- false)) {
++ &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){
+ dev_err(kfd_device, "Could not allocate %d bytes\n", size);
+ goto out;
+ }
+@@ -592,52 +549,21 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
+
+ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+ {
+- if (!kfd->init_complete)
+- return 0;
+- kgd2kfd_suspend(kfd);
+-
+- /* hold dqm->lock to prevent further execution*/
+- mutex_lock(&kfd->dqm->lock);
+-
+- kfd_signal_reset_event(kfd);
+ return 0;
+ }
+
+-/*
+- * Fix me. KFD won't be able to resume existing process for now.
+- * We will keep all existing process in a evicted state and
+- * wait the process to be terminated.
+- */
+-
+ int kgd2kfd_post_reset(struct kfd_dev *kfd)
+ {
+- int ret, count;
+-
+- if (!kfd->init_complete)
+- return 0;
+-
+- mutex_unlock(&kfd->dqm->lock);
+-
+- ret = kfd_resume(kfd);
+- if (ret)
+- return ret;
+- count = atomic_dec_return(&kfd_locked);
+- WARN_ONCE(count != 0, "KFD reset ref. error");
+ return 0;
+ }
+
+-bool kfd_is_locked(void)
+-{
+- return (atomic_read(&kfd_locked) > 0);
+-}
+-
+ void kgd2kfd_suspend(struct kfd_dev *kfd)
+ {
+ if (!kfd->init_complete)
+ return;
+
+ /* For first KFD device suspend all the KFD processes */
+- if (atomic_inc_return(&kfd_locked) == 1)
++ if (atomic_inc_return(&kfd_device_suspended) == 1)
+ kfd_suspend_all_processes();
+
+ kfd->dqm->ops.stop(kfd->dqm);
+@@ -656,7 +582,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
+ if (ret)
+ return ret;
+
+- count = atomic_dec_return(&kfd_locked);
++ count = atomic_dec_return(&kfd_device_suspended);
+ WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
+ if (count == 0)
+ ret = kfd_resume_all_processes();
+@@ -704,19 +630,19 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
+
+ spin_lock(&kfd->interrupt_lock);
+
+- if (kfd->interrupts_active
+- && interrupt_is_wanted(kfd, ih_ring_entry,
+- patched_ihre, &is_patched)
++ if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry,
++ patched_ihre, &is_patched)
+ && enqueue_ih_ring_entry(kfd,
+- is_patched ? patched_ihre : ih_ring_entry))
++ is_patched ? patched_ihre : ih_ring_entry))
+ queue_work(kfd->ih_wq, &kfd->interrupt_work);
+
+ spin_unlock(&kfd->interrupt_lock);
+ }
+
+-int kgd2kfd_quiesce_mm(struct mm_struct *mm)
++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+ {
+ struct kfd_process *p;
++ struct kfd_process_device *pdd;
+ int r;
+
+ /* Because we are called from arbitrary context (workqueue) as opposed
+@@ -725,17 +651,26 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
+ */
+ p = kfd_lookup_process_by_mm(mm);
+ if (!p)
+- return -ESRCH;
++ return -ENODEV;
+
+- r = kfd_process_evict_queues(p);
++ if (kfd) {
++ r = -ENODEV;
++ pdd = kfd_get_process_device_data(kfd, p);
++ if (pdd)
++ r = kfd->dqm->ops.evict_process_queues(kfd->dqm,
++ &pdd->qpd);
++ } else {
++ r = kfd_process_evict_queues(p);
++ }
+
+ kfd_unref_process(p);
+ return r;
+ }
+
+-int kgd2kfd_resume_mm(struct mm_struct *mm)
++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+ {
+ struct kfd_process *p;
++ struct kfd_process_device *pdd;
+ int r;
+
+ /* Because we are called from arbitrary context (workqueue) as opposed
+@@ -744,9 +679,17 @@ int kgd2kfd_resume_mm(struct mm_struct *mm)
+ */
+ p = kfd_lookup_process_by_mm(mm);
+ if (!p)
+- return -ESRCH;
++ return -ENODEV;
+
+- r = kfd_process_restore_queues(p);
++ if (kfd) {
++ r = -ENODEV;
++ pdd = kfd_get_process_device_data(kfd, p);
++ if (pdd)
++ r = kfd->dqm->ops.restore_process_queues(kfd->dqm,
++ &pdd->qpd);
++ } else {
++ r = kfd_process_restore_queues(p);
++ }
+
+ kfd_unref_process(p);
+ return r;
+@@ -981,26 +924,3 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
+ kfree(mem_obj);
+ return 0;
+ }
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-/* This function will send a package to HIQ to hang the HWS
+- * which will trigger a GPU reset and bring the HWS back to normal state
+- */
+-int kfd_debugfs_hang_hws(struct kfd_dev *dev)
+-{
+- int r = 0;
+-
+- if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) {
+- pr_err("HWS is not enabled");
+- return -EINVAL;
+- }
+-
+- r = pm_debugfs_hang_hws(&dev->dqm->packets);
+- if (!r)
+- r = dqm_debugfs_execute_queues(dev->dqm);
+-
+- return r;
+-}
+-
+-#endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+index ae6f7d8..8c04f7a2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+@@ -21,11 +21,10 @@
+ *
+ */
+
+-#include <linux/ratelimit.h>
+-#include <linux/printk.h>
+ #include <linux/slab.h>
+ #include <linux/list.h>
+ #include <linux/types.h>
++#include <linux/printk.h>
+ #include <linux/bitops.h>
+ #include <linux/sched.h>
+ #include "kfd_priv.h"
+@@ -61,8 +60,6 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+ unsigned int sdma_queue_id);
+
+-static void kfd_process_hw_exception(struct work_struct *work);
+-
+ static inline
+ enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
+ {
+@@ -109,7 +106,7 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
+ unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
+ {
+ return dqm->dev->device_info->num_sdma_engines
+- * dqm->dev->device_info->num_sdma_queues_per_engine;
++ * KFD_SDMA_QUEUES_PER_ENGINE;
+ }
+
+ void program_sh_mem_settings(struct device_queue_manager *dqm,
+@@ -200,7 +197,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
+ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd,
+ qpd->vmid,
+ qpd->page_table_base);
+- /* invalidate the VM context after pasid and vmid mapping is set up */
++ /*invalidate the VM context after pasid and vmid mapping is set up*/
+ kfd_flush_tlb(qpd_to_pdd(qpd));
+
+ return 0;
+@@ -209,19 +206,16 @@ static int allocate_vmid(struct device_queue_manager *dqm,
+ static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
+ struct qcm_process_device *qpd)
+ {
+- const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf;
+- int ret;
++ uint32_t len;
+
+ if (!qpd->ib_kaddr)
+ return -ENOMEM;
+
+- ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+- if (ret)
+- return ret;
++ len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base,
++ (uint32_t *)qpd->ib_kaddr);
+
+ return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
+- qpd->ib_base, (uint32_t *)qpd->ib_kaddr,
+- pmf->release_mem_size / sizeof(uint32_t));
++ qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+ }
+
+ static void deallocate_vmid(struct device_queue_manager *dqm,
+@@ -290,6 +284,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ if (retval) {
+ if (list_empty(&qpd->queues_list))
+ deallocate_vmid(dqm, qpd, q);
++
+ goto out_unlock;
+ }
+
+@@ -359,10 +354,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ int retval;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+- if (!mqd_mgr)
++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
++ if (!mqd)
+ return -ENOMEM;
+
+ retval = allocate_hqd(dqm, q);
+@@ -373,7 +368,7 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+ if (retval)
+ goto out_deallocate_hqd;
+
+- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
++ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+ goto out_deallocate_doorbell;
+@@ -387,15 +382,15 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+ if (!q->properties.is_active)
+ return 0;
+
+- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
+- &q->properties, q->process->mm);
++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties,
++ q->process->mm);
+ if (retval)
+ goto out_uninit_mqd;
+
+ return 0;
+
+ out_uninit_mqd:
+- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
+ out_deallocate_hqd:
+@@ -412,11 +407,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
+ struct queue *q)
+ {
+ int retval;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr)
++ if (!mqd)
+ return -ENOMEM;
+
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+@@ -433,14 +428,14 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
+
+ deallocate_doorbell(qpd, q);
+
+- retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
++ retval = mqd->destroy_mqd(mqd, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+ KFD_UNMAP_LATENCY_MS,
+ q->pipe, q->queue);
+ if (retval == -ETIME)
+ qpd->reset_wavefronts = true;
+
+- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+
+ list_del(&q->list);
+ if (list_empty(&qpd->queues_list)) {
+@@ -480,19 +475,21 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
+ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ {
+ int retval;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ struct kfd_process_device *pdd;
++
+ bool prev_active = false;
+
+ mutex_lock(&dqm->lock);
++
+ pdd = kfd_get_process_device_data(q->device, q->process);
+ if (!pdd) {
+ retval = -ENODEV;
+ goto out_unlock;
+ }
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr) {
++ if (!mqd) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+@@ -500,7 +497,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ * Eviction state logic: we only mark active queues as evicted
+ * to avoid the overhead of restoring inactive queues later
+ */
+- if (pdd->qpd.evicted)
++ if (pdd->qpd.evicted > 0)
+ q->properties.is_evicted = (q->properties.queue_size > 0 &&
+ q->properties.queue_percent > 0 &&
+ q->properties.queue_address != 0);
+@@ -519,7 +516,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ } else if (prev_active &&
+ (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+- retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
++ retval = mqd->destroy_mqd(mqd, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+ KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+ if (retval) {
+@@ -528,7 +525,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ }
+ }
+
+- retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+
+ /*
+ * check active state vs. the previous state and modify
+@@ -546,7 +543,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ else if (q->properties.is_active &&
+ (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA))
+- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue,
+ &q->properties, q->process->mm);
+
+ out_unlock:
+@@ -557,29 +554,29 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ static struct mqd_manager *get_mqd_manager(
+ struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
+ {
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+ pr_debug("mqd type %d\n", type);
+
+- mqd_mgr = dqm->mqd_mgrs[type];
+- if (!mqd_mgr) {
+- mqd_mgr = mqd_manager_init(type, dqm->dev);
+- if (!mqd_mgr)
++ mqd = dqm->mqds[type];
++ if (!mqd) {
++ mqd = mqd_manager_init(type, dqm->dev);
++ if (!mqd)
+ pr_err("mqd manager is NULL");
+- dqm->mqd_mgrs[type] = mqd_mgr;
++ dqm->mqds[type] = mqd;
+ }
+
+- return mqd_mgr;
++ return mqd;
+ }
+
+ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct queue *q;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ struct kfd_process_device *pdd;
+ int retval = 0;
+
+@@ -595,16 +592,16 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (!q->properties.is_active)
+ continue;
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr) { /* should not be here */
++ if (!mqd) { /* should not be here */
+ pr_err("Cannot evict queue, mqd mgr is NULL\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+ q->properties.is_evicted = true;
+ q->properties.is_active = false;
+- retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
++ retval = mqd->destroy_mqd(mqd, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+ KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+ if (retval)
+@@ -654,9 +651,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct queue *q;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ struct kfd_process_device *pdd;
+- uint64_t pd_base;
++ uint32_t pd_base;
+ int retval = 0;
+
+ pdd = qpd_to_pdd(qpd);
+@@ -676,7 +673,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
+
+ /* Update PD Base in QPD */
+ qpd->page_table_base = pd_base;
+- pr_debug("Updated PD address to 0x%llx\n", pd_base);
++ pr_debug("Updated PD address to 0x%08x\n", pd_base);
+
+ if (!list_empty(&qpd->queues_list)) {
+ dqm->dev->kfd2kgd->set_vm_context_page_table_base(
+@@ -690,16 +687,16 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (!q->properties.is_evicted)
+ continue;
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr) { /* should not be here */
++ if (!mqd) { /* should not be here */
+ pr_err("Cannot restore queue, mqd mgr is NULL\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+ q->properties.is_evicted = false;
+ q->properties.is_active = true;
+- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
+ q->queue, &q->properties,
+ q->process->mm);
+ if (retval)
+@@ -717,7 +714,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
+ {
+ struct queue *q;
+ struct kfd_process_device *pdd;
+- uint64_t pd_base;
++ uint32_t pd_base;
+ int retval = 0;
+
+ pdd = qpd_to_pdd(qpd);
+@@ -737,7 +734,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
+
+ /* Update PD Base in QPD */
+ qpd->page_table_base = pd_base;
+- pr_debug("Updated PD address to 0x%llx\n", pd_base);
++ pr_debug("Updated PD address to 0x%08x\n", pd_base);
+
+ /* activate all active queues on the qpd */
+ list_for_each_entry(q, &qpd->queues_list, list) {
+@@ -760,9 +757,9 @@ static int register_process(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct device_process_node *n;
+- struct kfd_process_device *pdd;
+- uint64_t pd_base;
+ int retval;
++ struct kfd_process_device *pdd;
++ uint32_t pd_base;
+
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
+ if (!n)
+@@ -779,7 +776,7 @@ static int register_process(struct device_queue_manager *dqm,
+
+ /* Update PD Base in QPD */
+ qpd->page_table_base = pd_base;
+- pr_debug("Updated PD address to 0x%llx\n", pd_base);
++ pr_debug("Updated PD address to 0x%08x\n", pd_base);
+
+ retval = dqm->asic_ops.update_qpd(dqm, qpd);
+
+@@ -880,7 +877,7 @@ static void uninitialize(struct device_queue_manager *dqm)
+
+ kfree(dqm->allocated_queues);
+ for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
+- kfree(dqm->mqd_mgrs[i]);
++ kfree(dqm->mqds[i]);
+ mutex_destroy(&dqm->lock);
+ kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
+ }
+@@ -888,7 +885,7 @@ static void uninitialize(struct device_queue_manager *dqm)
+ static int start_nocpsch(struct device_queue_manager *dqm)
+ {
+ init_interrupts(dqm);
+- return pm_init(&dqm->packets, dqm);
++ return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
+ }
+
+ static int stop_nocpsch(struct device_queue_manager *dqm)
+@@ -924,11 +921,11 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+ struct queue *q,
+ struct qcm_process_device *qpd)
+ {
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ int retval;
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
+- if (!mqd_mgr)
++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
++ if (!mqd)
+ return -ENOMEM;
+
+ retval = allocate_sdma_queue(dqm, &q->sdma_id);
+@@ -947,20 +944,19 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+ pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
+
+ dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
++ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+ goto out_deallocate_doorbell;
+
+- retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
+- NULL);
++ retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
+ if (retval)
+ goto out_uninit_mqd;
+
+ return 0;
+
+ out_uninit_mqd:
+- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ out_deallocate_doorbell:
+ deallocate_doorbell(qpd, q);
+ out_deallocate_sdma_queue:
+@@ -1025,8 +1021,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
+ dqm->active_runlist = false;
+ dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+
+- INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
+-
+ return 0;
+ }
+
+@@ -1036,7 +1030,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
+
+ retval = 0;
+
+- retval = pm_init(&dqm->packets, dqm);
++ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
+ if (retval)
+ goto fail_packet_manager_init;
+
+@@ -1059,8 +1053,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
+ init_interrupts(dqm);
+
+ mutex_lock(&dqm->lock);
+- /* clear hang status when driver try to start the hw scheduler */
+- dqm->is_hws_hang = false;
+ execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+ mutex_unlock(&dqm->lock);
+
+@@ -1075,7 +1067,9 @@ static int start_cpsch(struct device_queue_manager *dqm)
+ static int stop_cpsch(struct device_queue_manager *dqm)
+ {
+ mutex_lock(&dqm->lock);
++
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
++
+ mutex_unlock(&dqm->lock);
+
+ kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
+@@ -1136,7 +1130,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+ {
+ int retval;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+
+ retval = 0;
+
+@@ -1163,10 +1157,10 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+ if (retval)
+ goto out_deallocate_sdma_queue;
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+
+- if (!mqd_mgr) {
++ if (!mqd) {
+ retval = -ENOMEM;
+ goto out_deallocate_doorbell;
+ }
+@@ -1183,7 +1177,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+
+ q->properties.tba_addr = qpd->tba_addr;
+ q->properties.tma_addr = qpd->tma_addr;
+- retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
++ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+ goto out_deallocate_doorbell;
+@@ -1230,13 +1224,6 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ while (*fence_addr != fence_value) {
+ if (time_after(jiffies, end_jiffies)) {
+ pr_err("qcm fence wait loop timeout expired\n");
+- /* In HWS case, this is used to halt the driver thread
+- * in order not to mess up CP states before doing
+- * scandumps for FW debugging.
+- */
+- while (halt_if_hws_hang)
+- schedule();
+-
+ return -ETIME;
+ }
+ schedule();
+@@ -1281,8 +1268,6 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+ {
+ int retval = 0;
+
+- if (dqm->is_hws_hang)
+- return -EIO;
+ if (!dqm->active_runlist)
+ return retval;
+
+@@ -1321,13 +1306,9 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
+ {
+ int retval;
+
+- if (dqm->is_hws_hang)
+- return -EIO;
+ retval = unmap_queues_cpsch(dqm, filter, filter_param);
+ if (retval) {
+ pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+- dqm->is_hws_hang = true;
+- schedule_work(&dqm->hw_exception_work);
+ return retval;
+ }
+
+@@ -1339,7 +1320,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+ struct queue *q)
+ {
+ int retval;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ bool preempt_all_queues;
+
+ preempt_all_queues = false;
+@@ -1359,9 +1340,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+
+ }
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr) {
++ if (!mqd) {
+ retval = -ENOMEM;
+ goto failed;
+ }
+@@ -1382,7 +1363,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+ if (retval == -ETIME)
+ qpd->reset_wavefronts = true;
+
+- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+
+ /*
+ * Unconditionally decrement this counter, regardless of the queue's
+@@ -1531,7 +1512,7 @@ static int get_wave_state(struct device_queue_manager *dqm,
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size)
+ {
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ int r;
+
+ mutex_lock(&dqm->lock);
+@@ -1542,19 +1523,19 @@ static int get_wave_state(struct device_queue_manager *dqm,
+ goto dqm_unlock;
+ }
+
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+- if (!mqd_mgr) {
++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
++ if (!mqd) {
+ r = -ENOMEM;
+ goto dqm_unlock;
+ }
+
+- if (!mqd_mgr->get_wave_state) {
++ if (!mqd->get_wave_state) {
+ r = -EINVAL;
+ goto dqm_unlock;
+ }
+
+- r = mqd_mgr->get_wave_state(mqd_mgr, q->mqd, ctl_stack,
+- ctl_stack_used_size, save_area_used_size);
++ r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size,
++ save_area_used_size);
+
+ dqm_unlock:
+ mutex_unlock(&dqm->lock);
+@@ -1567,7 +1548,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
+ int retval;
+ struct queue *q, *next;
+ struct kernel_queue *kq, *kq_next;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ struct device_process_node *cur, *next_dpn;
+ enum kfd_unmap_queues_filter filter =
+ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+@@ -1609,7 +1590,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
+ }
+
+ retval = execute_queues_cpsch(dqm, filter, 0);
+- if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
++ if (retval || qpd->reset_wavefronts) {
+ pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
+ dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
+ qpd->reset_wavefronts = false;
+@@ -1617,15 +1598,15 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
+
+ /* lastly, free mqd resources */
+ list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+- mqd_mgr = dqm->ops.get_mqd_manager(dqm,
++ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd_mgr) {
++ if (!mqd) {
+ retval = -ENOMEM;
+ goto out;
+ }
+ list_del(&q->list);
+ qpd->queue_count--;
+- mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
++ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ }
+
+ out:
+@@ -1644,13 +1625,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ return NULL;
+
+ switch (dev->device_info->asic_family) {
+- /* HWS is not available on Hawaii. */
+ case CHIP_HAWAII:
+- /* HWS depends on CWSR for timely dequeue. CWSR is not
+- * available on Tonga.
+- *
+- * FIXME: This argument also applies to Kaveri.
+- */
+ case CHIP_TONGA:
+ dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS;
+ break;
+@@ -1729,9 +1704,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ break;
+
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+- device_queue_manager_init_v9(&dqm->asic_ops);
++ device_queue_manager_init_v9_vega10(&dqm->asic_ops);
+ break;
+ default:
+ WARN(1, "Unexpected ASIC family %u",
+@@ -1770,13 +1744,6 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
+ return ret;
+ }
+
+-static void kfd_process_hw_exception(struct work_struct *work)
+-{
+- struct device_queue_manager *dqm = container_of(work,
+- struct device_queue_manager, hw_exception_work);
+- dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
+-}
+-
+ #if defined(CONFIG_DEBUG_FS)
+
+ static void seq_reg_dump(struct seq_file *m,
+@@ -1841,9 +1808,7 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
+ }
+
+ for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
+- for (queue = 0;
+- queue < dqm->dev->device_info->num_sdma_queues_per_engine;
+- queue++) {
++ for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
+ r = dqm->dev->kfd2kgd->hqd_sdma_dump(
+ dqm->dev->kgd, pipe, queue, &dump, &n_regs);
+ if (r)
+@@ -1860,16 +1825,4 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
+ return r;
+ }
+
+-int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
+-{
+- int r = 0;
+-
+- mutex_lock(&dqm->lock);
+- dqm->active_runlist = true;
+- r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+- mutex_unlock(&dqm->lock);
+-
+- return r;
+-}
+-
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+index 1c4ef00..978458a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+@@ -31,6 +31,7 @@
+
+ #define KFD_UNMAP_LATENCY_MS (4000)
+ #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
++#define KFD_SDMA_QUEUES_PER_ENGINE (2)
+
+ struct device_process_node {
+ struct qcm_process_device *qpd;
+@@ -174,7 +175,7 @@ struct device_queue_manager {
+ struct device_queue_manager_ops ops;
+ struct device_queue_manager_asic_ops asic_ops;
+
+- struct mqd_manager *mqd_mgrs[KFD_MQD_TYPE_MAX];
++ struct mqd_manager *mqds[KFD_MQD_TYPE_MAX];
+ struct packet_manager packets;
+ struct kfd_dev *dev;
+ struct mutex lock;
+@@ -194,10 +195,6 @@ struct device_queue_manager {
+ struct kfd_mem_obj *fence_mem;
+ bool active_runlist;
+ int sched_policy;
+-
+- /* hw exception */
+- bool is_hws_hang;
+- struct work_struct hw_exception_work;
+ };
+
+ void device_queue_manager_init_cik(
+@@ -208,7 +205,7 @@ void device_queue_manager_init_vi(
+ struct device_queue_manager_asic_ops *asic_ops);
+ void device_queue_manager_init_vi_tonga(
+ struct device_queue_manager_asic_ops *asic_ops);
+-void device_queue_manager_init_v9(
++void device_queue_manager_init_v9_vega10(
+ struct device_queue_manager_asic_ops *asic_ops);
+ void program_sh_mem_settings(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+@@ -217,11 +214,18 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
+ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
+ unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
+
++int process_evict_queues(struct device_queue_manager *dqm,
++ struct qcm_process_device *qpd);
++int process_restore_queues(struct device_queue_manager *dqm,
++ struct qcm_process_device *qpd);
++
++
+ static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
+ {
+ return (pdd->lds_base >> 16) & 0xFF;
+ }
+
++/* This function is only useful for GFXv7 and v8 */
+ static inline unsigned int
+ get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd)
+ {
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+index 4175153..6198bf2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2018 Advanced Micro Devices, Inc.
++ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -32,7 +32,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
+ static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+
+-void device_queue_manager_init_v9(
++void device_queue_manager_init_v9_vega10(
+ struct device_queue_manager_asic_ops *asic_ops)
+ {
+ asic_ops->update_qpd = update_qpd_v9;
+@@ -60,7 +60,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
+ qpd->sh_mem_config =
+ SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+- if (noretry &&
++ if (vega10_noretry &&
+ !dqm->dev->device_info->needs_iommu_device)
+ qpd->sh_mem_config |=
+ 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+index fd60a11..030b014 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+@@ -33,30 +33,26 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
++static int update_qpd_vi(struct device_queue_manager *dqm,
++ struct qcm_process_device *qpd);
++static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
++ struct qcm_process_device *qpd);
++
++/*
++ * Tonga device queue manager functions
++ */
+ static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
+-static int update_qpd_vi(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+ static int update_qpd_vi_tonga(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+-static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+- struct qcm_process_device *qpd);
+ static void init_sdma_vm_tonga(struct device_queue_manager *dqm,
+ struct queue *q,
+ struct qcm_process_device *qpd);
+
+-void device_queue_manager_init_vi(
+- struct device_queue_manager_asic_ops *asic_ops)
+-{
+- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
+- asic_ops->update_qpd = update_qpd_vi;
+- asic_ops->init_sdma_vm = init_sdma_vm;
+-}
+-
+ void device_queue_manager_init_vi_tonga(
+ struct device_queue_manager_asic_ops *asic_ops)
+ {
+@@ -65,6 +61,15 @@ void device_queue_manager_init_vi_tonga(
+ asic_ops->init_sdma_vm = init_sdma_vm_tonga;
+ }
+
++
++void device_queue_manager_init_vi(
++ struct device_queue_manager_asic_ops *asic_ops)
++{
++ asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
++ asic_ops->update_qpd = update_qpd_vi;
++ asic_ops->init_sdma_vm = init_sdma_vm;
++}
++
+ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
+ {
+ /* In 64-bit mode, we can only control the top 3 bits of the LDS,
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+index ebe79bf..fc41689 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+@@ -115,7 +115,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
+ pr_debug("doorbell aperture size == 0x%08lX\n",
+ kfd->shared_resources.doorbell_aperture_size);
+
+- pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr);
++ pr_debug("doorbell kernel address == 0x%p\n", kfd->doorbell_kernel_ptr);
+
+ return 0;
+ }
+@@ -188,9 +188,9 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+ *doorbell_off = kfd->doorbell_id_offset + inx;
+
+ pr_debug("Get kernel queue doorbell\n"
+- " doorbell offset == 0x%08X\n"
+- " doorbell index == 0x%x\n",
+- *doorbell_off, inx);
++ " doorbell offset == 0x%08X\n"
++ " kernel address == 0x%p\n",
++ *doorbell_off, (kfd->doorbell_kernel_ptr + inx));
+
+ return kfd->doorbell_kernel_ptr + inx;
+ }
+@@ -199,8 +199,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
+ {
+ unsigned int inx;
+
+- inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr)
+- * sizeof(u32) / kfd->device_info->doorbell_size;
++ inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr);
+
+ mutex_lock(&kfd->doorbell_mutex);
+ __clear_bit(inx, kfd->doorbell_available_index);
+@@ -211,7 +210,7 @@ void write_kernel_doorbell(void __iomem *db, u32 value)
+ {
+ if (db) {
+ writel(value, db);
+- pr_debug("Writing %d to doorbell address %p\n", value, db);
++ pr_debug("Writing %d to doorbell address 0x%p\n", value, db);
+ }
+ }
+
+@@ -221,10 +220,14 @@ void write_kernel_doorbell64(void __iomem *db, u64 value)
+ WARN(((unsigned long)db & 7) != 0,
+ "Unaligned 64-bit doorbell");
+ writeq(value, (u64 __iomem *)db);
+- pr_debug("writing %llu to doorbell address %p\n", value, db);
++ pr_debug("writing %llu to doorbell address 0x%p\n", value, db);
+ }
+ }
+
++/*
++ * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
++ * to doorbells with the process's doorbell page
++ */
+ unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
+ struct kfd_process *process,
+ unsigned int doorbell_id)
+@@ -236,8 +239,7 @@ unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
+ * units regardless of the ASIC-dependent doorbell size.
+ */
+ return kfd->doorbell_id_offset +
+- process->doorbell_index
+- * kfd_doorbell_process_slice(kfd) / sizeof(u32) +
++ process->doorbell_index * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) +
+ doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+index 1dc1584..a92ca78 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+@@ -51,8 +51,8 @@ struct kfd_event_waiter {
+ */
+ struct kfd_signal_page {
+ uint64_t *kernel_address;
++ uint64_t handle;
+ uint64_t __user *user_address;
+- bool need_to_free_pages;
+ };
+
+
+@@ -80,7 +80,6 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
+ KFD_SIGNAL_EVENT_LIMIT * 8);
+
+ page->kernel_address = backing_store;
+- page->need_to_free_pages = true;
+ pr_debug("Allocated new event signal page at %p, for process %p\n",
+ page, p);
+
+@@ -100,17 +99,9 @@ static int allocate_event_notification_slot(struct kfd_process *p,
+ p->signal_page = allocate_signal_page(p);
+ if (!p->signal_page)
+ return -ENOMEM;
+- /* Oldest user mode expects 256 event slots */
+- p->signal_mapped_size = 256*8;
+ }
+
+- /*
+- * Compatibility with old user mode: Only use signal slots
+- * user mode has mapped, may be less than
+- * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase
+- * of the event limit without breaking user mode.
+- */
+- id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8,
++ id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT,
+ GFP_KERNEL);
+ if (id < 0)
+ return id;
+@@ -121,6 +112,29 @@ static int allocate_event_notification_slot(struct kfd_process *p,
+ return 0;
+ }
+
++static struct kfd_signal_page *allocate_signal_page_dgpu(
++ struct kfd_process *p, uint64_t *kernel_address, uint64_t handle)
++{
++ struct kfd_signal_page *my_page;
++
++ my_page = kzalloc(sizeof(*my_page), GFP_KERNEL);
++ if (!my_page)
++ return NULL;
++
++ /* Initialize all events to unsignaled */
++ memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
++ KFD_SIGNAL_EVENT_LIMIT * 8);
++
++ my_page->kernel_address = kernel_address;
++ my_page->handle = handle;
++ my_page->user_address = NULL;
++
++ pr_debug("Allocated new event signal page at %p, for process %p\n",
++ my_page, p);
++
++ return my_page;
++}
++
+ /*
+ * Assumes that p->event_mutex is held and of course that p is not going
+ * away (current or locked).
+@@ -184,8 +198,7 @@ static int create_signal_event(struct file *devkfd,
+ {
+ int ret;
+
+- if (p->signal_mapped_size &&
+- p->signal_event_count == p->signal_mapped_size / 8) {
++ if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
+ if (!p->signal_event_limit_reached) {
+ pr_warn("Signal event wasn't created because limit was reached\n");
+ p->signal_event_limit_reached = true;
+@@ -271,9 +284,9 @@ static void shutdown_signal_page(struct kfd_process *p)
+ struct kfd_signal_page *page = p->signal_page;
+
+ if (page) {
+- if (page->need_to_free_pages)
++ if (page->user_address)
+ free_pages((unsigned long)page->kernel_address,
+- get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
+ kfree(page);
+ }
+ }
+@@ -295,34 +308,11 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
+ return ev->type == KFD_EVENT_TYPE_SIGNAL;
+ }
+
+-int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
+- uint64_t size)
+-{
+- struct kfd_signal_page *page;
+-
+- if (p->signal_page)
+- return -EBUSY;
+-
+- page = kzalloc(sizeof(*page), GFP_KERNEL);
+- if (!page)
+- return -ENOMEM;
+-
+- /* Initialize all events to unsignaled */
+- memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
+- KFD_SIGNAL_EVENT_LIMIT * 8);
+-
+- page->kernel_address = kernel_address;
+-
+- p->signal_page = page;
+- p->signal_mapped_size = size;
+-
+- return 0;
+-}
+-
+ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ uint32_t event_type, bool auto_reset, uint32_t node_id,
+ uint32_t *event_id, uint32_t *event_trigger_data,
+- uint64_t *event_page_offset, uint32_t *event_slot_index)
++ uint64_t *event_page_offset, uint32_t *event_slot_index,
++ void *kern_addr)
+ {
+ int ret = 0;
+ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+@@ -336,10 +326,19 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+
+ init_waitqueue_head(&ev->wq);
+
+- *event_page_offset = 0;
+-
+ mutex_lock(&p->event_mutex);
+
++ if (kern_addr && !p->signal_page) {
++ p->signal_page = allocate_signal_page_dgpu(p, kern_addr,
++ *event_page_offset);
++ if (!p->signal_page) {
++ ret = -ENOMEM;
++ goto out;
++ }
++ }
++
++ *event_page_offset = 0;
++
+ switch (event_type) {
+ case KFD_EVENT_TYPE_SIGNAL:
+ case KFD_EVENT_TYPE_DEBUG:
+@@ -362,6 +361,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ kfree(ev);
+ }
+
++out:
+ mutex_unlock(&p->event_mutex);
+
+ return ret;
+@@ -390,11 +390,7 @@ static void set_event(struct kfd_event *ev)
+ {
+ struct kfd_event_waiter *waiter;
+
+- /* Auto reset if the list is non-empty and we're waking
+- * someone. waitqueue_active is safe here because we're
+- * protected by the p->event_mutex, which is also held when
+- * updating the wait queues in kfd_wait_on_events.
+- */
++ /* Auto reset if the list is non-empty and we're waking someone. */
+ ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
+
+ list_for_each_entry(waiter, &ev->wq.head, wait.entry)
+@@ -781,12 +777,12 @@ int kfd_wait_on_events(struct kfd_process *p,
+
+ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
+ {
++
+ unsigned long pfn;
+ struct kfd_signal_page *page;
+- int ret;
+
+- /* check required size doesn't exceed the allocated size */
+- if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) <
++ /* check required size is logical */
++ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) !=
+ get_order(vma->vm_end - vma->vm_start)) {
+ pr_err("Event page mmap requested illegal size\n");
+ return -EINVAL;
+@@ -816,12 +812,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
+ page->user_address = (uint64_t __user *)vma->vm_start;
+
+ /* mapping the page to user process */
+- ret = remap_pfn_range(vma, vma->vm_start, pfn,
++ return remap_pfn_range(vma, vma->vm_start, pfn,
+ vma->vm_end - vma->vm_start, vma->vm_page_prot);
+- if (!ret)
+- p->signal_mapped_size = vma->vm_end - vma->vm_start;
+-
+- return ret;
+ }
+
+ /*
+@@ -1012,30 +1004,3 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ mutex_unlock(&p->event_mutex);
+ kfd_unref_process(p);
+ }
+-
+-void kfd_signal_reset_event(struct kfd_dev *dev)
+-{
+- struct kfd_hsa_hw_exception_data hw_exception_data;
+- struct kfd_process *p;
+- struct kfd_event *ev;
+- unsigned int temp;
+- uint32_t id, idx;
+-
+- /* Whole gpu reset caused by GPU hang , and memory is lost */
+- memset(&hw_exception_data, 0, sizeof(hw_exception_data));
+- hw_exception_data.gpu_id = dev->id;
+- hw_exception_data.memory_lost = 1;
+-
+- idx = srcu_read_lock(&kfd_processes_srcu);
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- mutex_lock(&p->event_mutex);
+- id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+- idr_for_each_entry_continue(&p->event_idr, ev, id)
+- if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
+- ev->hw_exception_data = hw_exception_data;
+- set_event(ev);
+- }
+- mutex_unlock(&p->event_mutex);
+- }
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+index c7ac6c7..abca5bf 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+@@ -66,7 +66,6 @@ struct kfd_event {
+ /* type specific data */
+ union {
+ struct kfd_hsa_memory_exception_data memory_exception_data;
+- struct kfd_hsa_hw_exception_data hw_exception_data;
+ };
+ };
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+index 8f123a2..2c00711 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+@@ -289,6 +289,7 @@
+
+ #define MAKE_LDS_APP_BASE_VI() \
+ (((uint64_t)(0x1UL) << 61) + 0x0)
++
+ #define MAKE_LDS_APP_LIMIT(base) \
+ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+
+@@ -312,7 +313,17 @@
+ #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE)
+ #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE)
+
+-static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
++ uint64_t base, uint64_t limit)
++{
++ if (base < SVM_USER_BASE) {
++ pr_err("Set dgpu vm base 0x%llx failed.\n", base);
++ return -EINVAL;
++ }
++ return 0;
++}
++
++void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
+ {
+ /*
+ * node id couldn't be 0 - the three MSB bits of
+@@ -321,42 +332,19 @@ static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
+ pdd->lds_base = MAKE_LDS_APP_BASE_VI();
+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+
+- if (!pdd->dev->device_info->needs_iommu_device) {
+- /* dGPUs: SVM aperture starting at 0
+- * with small reserved space for kernel.
+- * Set them to CANONICAL addresses.
+- */
+- pdd->gpuvm_base = SVM_USER_BASE;
+- pdd->gpuvm_limit =
+- pdd->dev->shared_resources.gpuvm_size - 1;
+- } else {
+- /* set them to non CANONICAL addresses, and no SVM is
+- * allocated.
+- */
+- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
+- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base,
+- pdd->dev->shared_resources.gpuvm_size);
+- }
++ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
++ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
++ pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size);
+
+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+ }
+
+-static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
++void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
+ {
+ pdd->lds_base = MAKE_LDS_APP_BASE_V9();
+ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+
+- /* Raven needs SVM to support graphic handle, etc. Leave the small
+- * reserved space before SVM on Raven as well, even though we don't
+- * have to.
+- * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they
+- * are used in Thunk to reserve SVM.
+- */
+- pdd->gpuvm_base = SVM_USER_BASE;
+- pdd->gpuvm_limit =
+- pdd->dev->shared_resources.gpuvm_size - 1;
+-
+ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
+ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+ }
+@@ -377,10 +365,10 @@ int kfd_init_apertures(struct kfd_process *process)
+ pdd = kfd_create_process_device_data(dev, process);
+ if (!pdd) {
+ pr_err("Failed to create process device data\n");
+- return -ENOMEM;
++ return -1;
+ }
+ /*
+- * For 64 bit process apertures will be statically reserved in
++ * For 64 bit process aperture will be statically reserved in
+ * the x86_64 non canonical process address space
+ * amdkfd doesn't currently support apertures for 32 bit process
+ */
+@@ -400,20 +388,21 @@ int kfd_init_apertures(struct kfd_process *process)
+ kfd_init_apertures_vi(pdd, id);
+ break;
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+ kfd_init_apertures_v9(pdd, id);
+ break;
+ default:
+- WARN(1, "Unexpected ASIC family %u",
+- dev->device_info->asic_family);
+- return -EINVAL;
++ pr_err("Unknown chip in kfd_init_apertures\n");
++ return -1;
+ }
+
+ if (!dev->device_info->needs_iommu_device) {
+- /* dGPUs: the reserved space for kernel
+- * before SVM
++ /* dGPUs: SVM aperture starting at 0
++ * with small reserved space for kernel
+ */
++ pdd->gpuvm_base = SVM_USER_BASE;
++ pdd->gpuvm_limit =
++ dev->shared_resources.gpuvm_size - 1;
+ pdd->qpd.cwsr_base = SVM_CWSR_BASE;
+ pdd->qpd.ib_base = SVM_IB_BASE;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+index f836897..009d6f4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2018 Advanced Micro Devices, Inc.
++ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -25,43 +25,70 @@
+ #include "soc15_int.h"
+
+
++static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid)
++{
++ uint32_t pasid = 0;
++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
++
++ if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
++ pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
++
++ return pasid;
++}
++
+ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry,
+ uint32_t *patched_ihre,
+ bool *patched_flag)
+ {
+ uint16_t source_id, client_id, pasid, vmid;
+- const uint32_t *data = ih_ring_entry;
++ bool result = false;
+
+- /* Only handle interrupts from KFD VMIDs */
++ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
++ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
++ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+- if (vmid < dev->vm_info.first_vmid_kfd ||
+- vmid > dev->vm_info.last_vmid_kfd)
+- return 0;
+
+- /* If there is no valid PASID, it's likely a firmware bug */
+- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+- if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
+- return 0;
++ if (pasid) {
++ const uint32_t *data = ih_ring_entry;
+
+- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
++ pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
++ client_id, source_id, pasid);
++ pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
++ data[0], data[1], data[2], data[3],
++ data[4], data[5], data[6], data[7]);
++ }
++
++ if ((vmid >= dev->vm_info.first_vmid_kfd &&
++ vmid <= dev->vm_info.last_vmid_kfd) &&
++ (source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
++ source_id == SOC15_INTSRC_SDMA_TRAP ||
++ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
++ source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
++ client_id == SOC15_IH_CLIENTID_VMC ||
++ client_id == SOC15_IH_CLIENTID_UTCL2)) {
++
++ /*
++ * KFD want to handle this INT, but MEC firmware did
++ * not send pasid. Try to get it from vmid mapping
++ * and patch the ih entry. It's a temp workaround.
++ */
++ WARN_ONCE((!pasid), "Fix me.\n");
++ if (!pasid) {
++ uint32_t temp = le32_to_cpu(ih_ring_entry[3]);
++
++ pasid = kfd_get_pasid_from_vmid(dev, vmid);
++ memcpy(patched_ihre, ih_ring_entry,
++ dev->device_info->ih_ring_entry_size);
++ patched_ihre[3] = cpu_to_le32(temp | pasid);
++ *patched_flag = true;
++ }
++ result = pasid ? true : false;
++ }
++
++ /* Do not process in ISR, just request it to be forwarded to WQ. */
++ return result;
+
+- pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
+- client_id, source_id, pasid);
+- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+- data[0], data[1], data[2], data[3],
+- data[4], data[5], data[6], data[7]);
+-
+- /* Interrupt types we care about: various signals and faults.
+- * They will be forwarded to a work queue (see below).
+- */
+- return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+- source_id == SOC15_INTSRC_SDMA_TRAP ||
+- source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+- source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+- client_id == SOC15_IH_CLIENTID_VMC ||
+- client_id == SOC15_IH_CLIENTID_UTCL2;
+ }
+
+ static void event_interrupt_wq_v9(struct kfd_dev *dev,
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+index 7a61f38..5b798f9 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+@@ -75,8 +75,7 @@ int kfd_iommu_device_init(struct kfd_dev *kfd)
+ }
+
+ if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) {
+- dev_err(kfd_device,
+- "error required iommu flags ats %i, pri %i, pasid %i\n",
++ dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n",
+ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0,
+ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0,
+ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP)
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+index a53d954..97806ed 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+@@ -140,7 +140,7 @@ static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev,
+ goto err_unlock;
+
+ idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+- va_addr, size, 0, 0,
++ va_addr, size,
+ ipc_obj);
+ if (idr_handle < 0) {
+ r = -EFAULT;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+index e78445d..8cf9d44 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+@@ -59,7 +59,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ switch (type) {
+ case KFD_QUEUE_TYPE_DIQ:
+ case KFD_QUEUE_TYPE_HIQ:
+- kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm,
++ kq->mqd = dev->dqm->ops.get_mqd_manager(dev->dqm,
+ KFD_MQD_TYPE_HIQ);
+ break;
+ default:
+@@ -67,7 +67,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ return false;
+ }
+
+- if (!kq->mqd_mgr)
++ if (!kq->mqd)
+ return false;
+
+ prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off);
+@@ -131,7 +131,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ kq->queue->device = dev;
+ kq->queue->process = kfd_get_process(current);
+
+- retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd,
++ retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd,
+ &kq->queue->mqd_mem_obj,
+ &kq->queue->gart_mqd_addr,
+ &kq->queue->properties);
+@@ -143,9 +143,9 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ pr_debug("Assigning hiq to hqd\n");
+ kq->queue->pipe = KFD_CIK_HIQ_PIPE;
+ kq->queue->queue = KFD_CIK_HIQ_QUEUE;
+- kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd,
+- kq->queue->pipe, kq->queue->queue,
+- &kq->queue->properties, NULL);
++ kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe,
++ kq->queue->queue, &kq->queue->properties,
++ NULL);
+ } else {
+ /* allocate fence for DIQ */
+
+@@ -183,7 +183,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ static void uninitialize(struct kernel_queue *kq)
+ {
+ if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+- kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
++ kq->mqd->destroy_mqd(kq->mqd,
+ kq->queue->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+ KFD_UNMAP_LATENCY_MS,
+@@ -192,8 +192,7 @@ static void uninitialize(struct kernel_queue *kq)
+ else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
+ kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);
+
+- kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd,
+- kq->queue->mqd_mem_obj);
++ kq->mqd->uninit_mqd(kq->mqd, kq->queue->mqd, kq->queue->mqd_mem_obj);
+
+ kfd_gtt_sa_free(kq->dev, kq->rptr_mem);
+ kfd_gtt_sa_free(kq->dev, kq->wptr_mem);
+@@ -316,13 +315,7 @@ static void submit_packet(struct kernel_queue *kq)
+
+ static void rollback_packet(struct kernel_queue *kq)
+ {
+- if (kq->dev->device_info->doorbell_size == 8) {
+- kq->pending_wptr64 = *kq->wptr64_kernel;
+- kq->pending_wptr = *kq->wptr_kernel %
+- (kq->queue->properties.queue_size / 4);
+- } else {
+- kq->pending_wptr = *kq->wptr_kernel;
+- }
++ kq->pending_wptr = *kq->queue->properties.write_ptr;
+ }
+
+ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
+@@ -356,7 +349,6 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
+ break;
+
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+ kernel_queue_init_v9(&kq->ops_asic_specific);
+ break;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+index 384d7a3..82c94a6 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+@@ -80,7 +80,7 @@ struct kernel_queue {
+
+ /* data */
+ struct kfd_dev *dev;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd;
+ struct queue *queue;
+ uint64_t pending_wptr64;
+ uint32_t pending_wptr;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+index 19e54ac..2808422 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+@@ -22,6 +22,8 @@
+ */
+
+ #include "kfd_kernel_queue.h"
++#include "kfd_pm4_headers.h"
++#include "kfd_pm4_opcodes.h"
+
+ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size);
+@@ -51,3 +53,120 @@ static void submit_packet_cik(struct kernel_queue *kq)
+ write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
+ kq->pending_wptr);
+ }
++
++static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer,
++ struct qcm_process_device *qpd)
++{
++ struct pm4_map_process *packet;
++
++ packet = (struct pm4_map_process *)buffer;
++
++ memset(buffer, 0, sizeof(struct pm4_map_process));
++
++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS,
++ sizeof(struct pm4_map_process));
++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
++ packet->bitfields2.process_quantum = 1;
++ packet->bitfields2.pasid = qpd->pqm->process->pasid;
++ packet->bitfields3.page_table_base = qpd->page_table_base;
++ packet->bitfields10.gds_size = qpd->gds_size;
++ packet->bitfields10.num_gws = qpd->num_gws;
++ packet->bitfields10.num_oac = qpd->num_oac;
++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
++
++ packet->sh_mem_config = qpd->sh_mem_config;
++ packet->sh_mem_bases = qpd->sh_mem_bases;
++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
++
++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
++
++ return 0;
++}
++
++static int pm_map_process_scratch_cik(struct packet_manager *pm,
++ uint32_t *buffer, struct qcm_process_device *qpd)
++{
++ struct pm4_map_process_scratch_kv *packet;
++
++ packet = (struct pm4_map_process_scratch_kv *)buffer;
++
++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv));
++
++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS,
++ sizeof(struct pm4_map_process_scratch_kv));
++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
++ packet->bitfields2.process_quantum = 1;
++ packet->bitfields2.pasid = qpd->pqm->process->pasid;
++ packet->bitfields3.page_table_base = qpd->page_table_base;
++ packet->bitfields14.gds_size = qpd->gds_size;
++ packet->bitfields14.num_gws = qpd->num_gws;
++ packet->bitfields14.num_oac = qpd->num_oac;
++ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
++
++ packet->sh_mem_config = qpd->sh_mem_config;
++ packet->sh_mem_bases = qpd->sh_mem_bases;
++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
++
++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
++
++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
++
++ return 0;
++}
++
++static uint32_t pm_get_map_process_packet_size_cik(void)
++{
++ return sizeof(struct pm4_map_process);
++}
++static uint32_t pm_get_map_process_scratch_packet_size_cik(void)
++{
++ return sizeof(struct pm4_map_process_scratch_kv);
++}
++
++
++static struct packet_manager_funcs kfd_cik_pm_funcs = {
++ .map_process = pm_map_process_cik,
++ .runlist = pm_runlist_vi,
++ .set_resources = pm_set_resources_vi,
++ .map_queues = pm_map_queues_vi,
++ .unmap_queues = pm_unmap_queues_vi,
++ .query_status = pm_query_status_vi,
++ .release_mem = pm_release_mem_vi,
++ .get_map_process_packet_size = pm_get_map_process_packet_size_cik,
++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
++};
++
++static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = {
++ .map_process = pm_map_process_scratch_cik,
++ .runlist = pm_runlist_vi,
++ .set_resources = pm_set_resources_vi,
++ .map_queues = pm_map_queues_vi,
++ .unmap_queues = pm_unmap_queues_vi,
++ .query_status = pm_query_status_vi,
++ .release_mem = pm_release_mem_vi,
++ .get_map_process_packet_size =
++ pm_get_map_process_scratch_packet_size_cik,
++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
++};
++
++void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver)
++{
++ if (fw_ver >= KFD_SCRATCH_KV_FW_VER)
++ pm->pmf = &kfd_cik_scratch_pm_funcs;
++ else
++ pm->pmf = &kfd_cik_pm_funcs;
++}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+index 33830b1..5fe4f60 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2018 Advanced Micro Devices, Inc.
++ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -44,7 +44,7 @@ static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
+ int retval;
+
+ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
+- if (retval)
++ if (retval != 0)
+ return false;
+
+ kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
+@@ -71,7 +71,8 @@ static int pm_map_process_v9(struct packet_manager *pm,
+ uint32_t *buffer, struct qcm_process_device *qpd)
+ {
+ struct pm4_mes_map_process *packet;
+- uint64_t vm_page_table_base_addr = qpd->page_table_base;
++ uint64_t vm_page_table_base_addr =
++ (uint64_t)(qpd->page_table_base) << 12;
+
+ packet = (struct pm4_mes_map_process *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+@@ -125,6 +126,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
++
+ packet = (struct pm4_mes_runlist *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+@@ -293,7 +295,7 @@ static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer,
+ }
+
+
+-static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
++static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
+ {
+ struct pm4_mec_release_mem *packet;
+
+@@ -318,22 +320,58 @@ static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
+
+ packet->data_lo = 0;
+
+- return 0;
++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
++}
++
++static uint32_t pm_get_map_process_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mes_map_process);
++}
++
++static uint32_t pm_get_runlist_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mes_runlist);
++}
++
++static uint32_t pm_get_map_queues_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mes_map_queues);
++}
++
++static uint32_t pm_get_unmap_queues_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mes_unmap_queues);
++}
++
++static uint32_t pm_get_query_status_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mes_query_status);
++}
++
++static uint32_t pm_get_release_mem_packet_size_v9(void)
++{
++ return sizeof(struct pm4_mec_release_mem);
+ }
+
+-const struct packet_manager_funcs kfd_v9_pm_funcs = {
+- .map_process = pm_map_process_v9,
+- .runlist = pm_runlist_v9,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_v9,
+- .unmap_queues = pm_unmap_queues_v9,
+- .query_status = pm_query_status_v9,
+- .release_mem = pm_release_mem_v9,
+- .map_process_size = sizeof(struct pm4_mes_map_process),
+- .runlist_size = sizeof(struct pm4_mes_runlist),
+- .set_resources_size = sizeof(struct pm4_mes_set_resources),
+- .map_queues_size = sizeof(struct pm4_mes_map_queues),
+- .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
+- .query_status_size = sizeof(struct pm4_mes_query_status),
+- .release_mem_size = sizeof(struct pm4_mec_release_mem)
++static struct packet_manager_funcs kfd_v9_pm_funcs = {
++ .map_process = pm_map_process_v9,
++ .runlist = pm_runlist_v9,
++ .set_resources = pm_set_resources_vi,
++ .map_queues = pm_map_queues_v9,
++ .unmap_queues = pm_unmap_queues_v9,
++ .query_status = pm_query_status_v9,
++ .release_mem = pm_release_mem_v9,
++ .get_map_process_packet_size = pm_get_map_process_packet_size_v9,
++ .get_runlist_packet_size = pm_get_runlist_packet_size_v9,
++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9,
++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9,
++ .get_query_status_packet_size = pm_get_query_status_packet_size_v9,
++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9,
+ };
++
++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver)
++{
++ pm->pmf = &kfd_v9_pm_funcs;
++}
++
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+index bf20c6d..9022ecb 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+@@ -67,25 +67,12 @@ static void submit_packet_vi(struct kernel_queue *kq)
+ kq->pending_wptr);
+ }
+
+-unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
+-{
+- union PM4_MES_TYPE_3_HEADER header;
+-
+- header.u32All = 0;
+- header.opcode = opcode;
+- header.count = packet_size / 4 - 2;
+- header.type = PM4_TYPE_3;
+-
+- return header.u32All;
+-}
+-
+-static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct qcm_process_device *qpd)
++static int pm_map_process_vi(struct packet_manager *pm,
++ uint32_t *buffer, struct qcm_process_device *qpd)
+ {
+ struct pm4_mes_map_process *packet;
+
+ packet = (struct pm4_mes_map_process *)buffer;
+-
+ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+
+ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+@@ -112,16 +99,27 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer,
+ return 0;
+ }
+
+-static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
++
++unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
++{
++ union PM4_MES_TYPE_3_HEADER header;
++
++ header.u32All = 0;
++ header.opcode = opcode;
++ header.count = packet_size / 4 - 2;
++ header.type = PM4_TYPE_3;
++
++ return header.u32All;
++}
++
++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t ib, size_t ib_size_in_dwords, bool chain)
+ {
+ struct pm4_mes_runlist *packet;
++
+ int concurrent_proc_cnt = 0;
+ struct kfd_dev *kfd = pm->dqm->dev;
+
+- if (WARN_ON(!ib))
+- return -EFAULT;
+-
+ /* Determine the number of processes to map together to HW:
+ * it can not exceed the number of VMIDs available to the
+ * scheduler, and it is determined by the smaller of the number
+@@ -134,6 +132,7 @@ static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+ concurrent_proc_cnt = min(pm->dqm->processes_count,
+ kfd->max_proc_per_quantum);
+
++
+ packet = (struct pm4_mes_runlist *)buffer;
+
+ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+@@ -151,35 +150,7 @@ static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+ return 0;
+ }
+
+-int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct scheduling_resources *res)
+-{
+- struct pm4_mes_set_resources *packet;
+-
+- packet = (struct pm4_mes_set_resources *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
+- sizeof(struct pm4_mes_set_resources));
+-
+- packet->bitfields2.queue_type =
+- queue_type__mes_set_resources__hsa_interface_queue_hiq;
+- packet->bitfields2.vmid_mask = res->vmid_mask;
+- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
+- packet->bitfields7.oac_mask = res->oac_mask;
+- packet->bitfields8.gds_heap_base = res->gds_heap_base;
+- packet->bitfields8.gds_heap_size = res->gds_heap_size;
+-
+- packet->gws_mask_lo = lower_32_bits(res->gws_mask);
+- packet->gws_mask_hi = upper_32_bits(res->gws_mask);
+-
+- packet->queue_mask_lo = lower_32_bits(res->queue_mask);
+- packet->queue_mask_hi = upper_32_bits(res->queue_mask);
+-
+- return 0;
+-}
+-
+-static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct queue *q, bool is_static)
+ {
+ struct pm4_mes_map_queues *packet;
+@@ -238,7 +209,35 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+ return 0;
+ }
+
+-static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
++int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
++ struct scheduling_resources *res)
++{
++ struct pm4_mes_set_resources *packet;
++
++ packet = (struct pm4_mes_set_resources *)buffer;
++ memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
++
++ packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
++ sizeof(struct pm4_mes_set_resources));
++
++ packet->bitfields2.queue_type =
++ queue_type__mes_set_resources__hsa_interface_queue_hiq;
++ packet->bitfields2.vmid_mask = res->vmid_mask;
++ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
++ packet->bitfields7.oac_mask = res->oac_mask;
++ packet->bitfields8.gds_heap_base = res->gds_heap_base;
++ packet->bitfields8.gds_heap_size = res->gds_heap_size;
++
++ packet->gws_mask_lo = lower_32_bits(res->gws_mask);
++ packet->gws_mask_hi = upper_32_bits(res->gws_mask);
++
++ packet->queue_mask_lo = lower_32_bits(res->queue_mask);
++ packet->queue_mask_hi = upper_32_bits(res->queue_mask);
++
++ return 0;
++}
++
++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+ enum kfd_queue_type type,
+ enum kfd_unmap_queues_filter filter,
+ uint32_t filter_param, bool reset,
+@@ -303,7 +302,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+
+ }
+
+-static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value)
+ {
+ struct pm4_mes_query_status *packet;
+@@ -311,6 +310,7 @@ static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+ packet = (struct pm4_mes_query_status *)buffer;
+ memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+
++
+ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+ sizeof(struct pm4_mes_query_status));
+
+@@ -328,15 +328,16 @@ static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+ return 0;
+ }
+
+-static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
++
++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
+ {
+ struct pm4_mec_release_mem *packet;
+
+ packet = (struct pm4_mec_release_mem *)buffer;
+- memset(buffer, 0, sizeof(*packet));
++ memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+
+ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+- sizeof(*packet));
++ sizeof(struct pm4_mec_release_mem));
+
+ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+@@ -354,22 +355,63 @@ static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
+
+ packet->data_lo = 0;
+
+- return 0;
++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
++}
++
++uint32_t pm_get_map_process_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_map_process);
++}
++
++uint32_t pm_get_runlist_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_runlist);
++}
++
++uint32_t pm_get_set_resources_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_set_resources);
++}
++
++uint32_t pm_get_map_queues_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_map_queues);
++}
++
++uint32_t pm_get_unmap_queues_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_unmap_queues);
++}
++
++uint32_t pm_get_query_status_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mes_query_status);
+ }
+
+-const struct packet_manager_funcs kfd_vi_pm_funcs = {
+- .map_process = pm_map_process_vi,
+- .runlist = pm_runlist_vi,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_vi,
+- .unmap_queues = pm_unmap_queues_vi,
+- .query_status = pm_query_status_vi,
+- .release_mem = pm_release_mem_vi,
+- .map_process_size = sizeof(struct pm4_mes_map_process),
+- .runlist_size = sizeof(struct pm4_mes_runlist),
+- .set_resources_size = sizeof(struct pm4_mes_set_resources),
+- .map_queues_size = sizeof(struct pm4_mes_map_queues),
+- .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues),
+- .query_status_size = sizeof(struct pm4_mes_query_status),
+- .release_mem_size = sizeof(struct pm4_mec_release_mem)
++uint32_t pm_get_release_mem_packet_size_vi(void)
++{
++ return sizeof(struct pm4_mec_release_mem);
++}
++
++
++static struct packet_manager_funcs kfd_vi_pm_funcs = {
++ .map_process = pm_map_process_vi,
++ .runlist = pm_runlist_vi,
++ .set_resources = pm_set_resources_vi,
++ .map_queues = pm_map_queues_vi,
++ .unmap_queues = pm_unmap_queues_vi,
++ .query_status = pm_query_status_vi,
++ .release_mem = pm_release_mem_vi,
++ .get_map_process_packet_size = pm_get_map_process_packet_size_vi,
++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
+ };
++
++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver)
++{
++ pm->pmf = &kfd_vi_pm_funcs;
++}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+index 261657f..34d44ff 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+@@ -63,7 +63,7 @@ MODULE_PARM_DESC(hws_max_conc_proc,
+
+ int cwsr_enable = 1;
+ module_param(cwsr_enable, int, 0444);
+-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))");
++MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+
+ int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
+ module_param(max_num_of_queues_per_device, int, 0444);
+@@ -75,6 +75,8 @@ module_param(send_sigterm, int, 0444);
+ MODULE_PARM_DESC(send_sigterm,
+ "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)");
+
++static int amdkfd_init_completed;
++
+ int debug_largebar;
+ module_param(debug_largebar, int, 0444);
+ MODULE_PARM_DESC(debug_largebar,
+@@ -85,23 +87,16 @@ module_param(ignore_crat, int, 0444);
+ MODULE_PARM_DESC(ignore_crat,
+ "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
+
+-int noretry = 1;
+-module_param(noretry, int, 0644);
++int vega10_noretry = 1;
++module_param_named(noretry, vega10_noretry, int, 0644);
+ MODULE_PARM_DESC(noretry,
+- "Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry enabled, 1 = retry disabled (default))");
++ "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled, 1 = retry disabled (default))");
+
+ int priv_cp_queues;
+ module_param(priv_cp_queues, int, 0644);
+ MODULE_PARM_DESC(priv_cp_queues,
+ "Enable privileged mode for CP queues (0 = off (default), 1 = on)");
+
+-int halt_if_hws_hang;
+-module_param(halt_if_hws_hang, int, 0644);
+-MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)");
+-
+-
+-static int amdkfd_init_completed;
+-
+ int kgd2kfd_init(unsigned int interface_version,
+ const struct kgd2kfd_calls **g2f)
+ {
+@@ -154,7 +149,7 @@ static int __init kfd_module_init(void)
+
+ err = kfd_ipc_init();
+ if (err < 0)
+- goto err_ipc;
++ goto err_topology;
+
+ err = kfd_process_create_wq();
+ if (err < 0)
+@@ -171,8 +166,6 @@ static int __init kfd_module_init(void)
+ return 0;
+
+ err_create_wq:
+-err_ipc:
+- kfd_topology_shutdown();
+ err_topology:
+ kfd_chardev_exit();
+ err_ioctl:
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+index d39e81c..8279b74 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+@@ -81,7 +81,6 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
+ case CHIP_POLARIS11:
+ return mqd_manager_init_vi_tonga(type, dev);
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+ return mqd_manager_init_v9(type, dev);
+ default:
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+index 336ea9c..dcaeda8 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+@@ -94,8 +94,6 @@ struct mqd_manager {
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size);
+
+- bool (*check_queue_active)(struct queue *q);
+-
+ #if defined(CONFIG_DEBUG_FS)
+ int (*debugfs_show_mqd)(struct seq_file *m, void *data);
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+index 2441834..bd44a23 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+@@ -42,31 +42,6 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+ return (struct cik_sdma_rlc_registers *)mqd;
+ }
+
+-static bool check_sdma_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- struct cik_sdma_rlc_registers *m = get_sdma_mqd(q->mqd);
+-
+- rptr = m->sdma_rlc_rb_rptr;
+- wptr = m->sdma_rlc_rb_wptr;
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+-
+- return (rptr != wptr);
+-}
+-
+-static bool check_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- struct cik_mqd *m = get_mqd(q->mqd);
+-
+- rptr = m->cp_hqd_pq_rptr;
+- wptr = m->cp_hqd_pq_wptr;
+-
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+-
+- return (rptr != wptr);
+-}
+-
+ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+ {
+@@ -516,7 +491,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -528,7 +502,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -540,7 +513,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+- mqd->check_queue_active = check_sdma_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+index dcd24c4..f4e8efc 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2018 Advanced Micro Devices, Inc.
++ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -41,49 +41,6 @@ static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+ return (struct v9_sdma_mqd *)mqd;
+ }
+
+-static bool check_sdma_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- uint32_t rptr_hi, wptr_hi;
+- struct v9_sdma_mqd *m = get_sdma_mqd(q->mqd);
+-
+- rptr = m->sdmax_rlcx_rb_rptr;
+- wptr = m->sdmax_rlcx_rb_wptr;
+- rptr_hi = m->sdmax_rlcx_rb_rptr_hi;
+- wptr_hi = m->sdmax_rlcx_rb_wptr_hi;
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+- pr_debug("rptr_hi=%d, wptr_hi=%d\n", rptr_hi, wptr_hi);
+-
+- return (rptr != wptr || rptr_hi != wptr_hi);
+-}
+-
+-static bool check_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- uint32_t cntl_stack_offset, cntl_stack_size;
+- struct v9_mqd *m = get_mqd(q->mqd);
+-
+- rptr = m->cp_hqd_pq_rptr;
+- wptr = m->cp_hqd_pq_wptr_lo % q->properties.queue_size;
+- cntl_stack_offset = m->cp_hqd_cntl_stack_offset;
+- cntl_stack_size = m->cp_hqd_cntl_stack_size;
+-
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+- pr_debug("m->cp_hqd_cntl_stack_offset=0x%08x\n", cntl_stack_offset);
+- pr_debug("m->cp_hqd_cntl_stack_size=0x%08x\n", cntl_stack_size);
+-
+- if ((rptr == 0 && wptr == 0) ||
+- cntl_stack_offset == 0xffffffff ||
+- cntl_stack_size > 0x5000)
+- return false;
+-
+- /* Process is idle if both conditions are meet:
+- * queue's rptr equals to wptr
+- * control stack is empty, cntl_stack_offset = cntl_stack_size
+- */
+- return (rptr != wptr || cntl_stack_offset != cntl_stack_size);
+-}
+-
+ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+ {
+@@ -158,7 +115,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
+ &((*mqd_mem_obj)->gtt_mem),
+ &((*mqd_mem_obj)->gpu_addr),
+- (void *)&((*mqd_mem_obj)->cpu_ptr), true);
++ (void *)&((*mqd_mem_obj)->cpu_ptr));
+ } else
+ retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
+ mqd_mem_obj);
+@@ -202,7 +159,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+ }
+
+- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
++ if (mm->dev->cwsr_enabled) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+@@ -260,9 +217,8 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
+ pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+ m->cp_hqd_pq_doorbell_control);
+
+- m->cp_hqd_ib_control =
+- 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
+- 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;
++ m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
++ 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;
+
+ /*
+ * HW does not clamp this field correctly. Maximum EOP queue size
+@@ -287,13 +243,13 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
+ 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
+- m->cp_hqd_pq_doorbell_control |= 1 <<
+- CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
++ m->cp_hqd_pq_doorbell_control |=
++ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+ }
+ if (priv_cp_queues)
+ m->cp_hqd_pq_control |=
+ 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT;
+- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
++ if (mm->dev->cwsr_enabled)
+ m->cp_hqd_ctx_save_control = 0;
+
+ update_cu_mask(mm, mqd, q);
+@@ -532,7 +488,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->get_wave_state = get_wave_state;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -544,7 +499,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -556,7 +510,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+- mqd->check_queue_active = check_sdma_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+index 246fe6c..eff7580 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+@@ -44,45 +44,6 @@ static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
+ return (struct vi_sdma_mqd *)mqd;
+ }
+
+-static bool check_sdma_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- struct vi_sdma_mqd *m = get_sdma_mqd(q->mqd);
+-
+- rptr = m->sdmax_rlcx_rb_rptr;
+- wptr = m->sdmax_rlcx_rb_wptr;
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+-
+- return (rptr != wptr);
+-}
+-
+-static bool check_queue_active(struct queue *q)
+-{
+- uint32_t rptr, wptr;
+- uint32_t cntl_stack_offset, cntl_stack_size;
+- struct vi_mqd *m = get_mqd(q->mqd);
+-
+- rptr = m->cp_hqd_pq_rptr;
+- wptr = m->cp_hqd_pq_wptr;
+- cntl_stack_offset = m->cp_hqd_cntl_stack_offset;
+- cntl_stack_size = m->cp_hqd_cntl_stack_size;
+-
+- pr_debug("rptr=%d, wptr=%d\n", rptr, wptr);
+- pr_debug("m->cp_hqd_cntl_stack_offset=0x%08x\n", cntl_stack_offset);
+- pr_debug("m->cp_hqd_cntl_stack_size=0x%08x\n", cntl_stack_size);
+-
+- if ((rptr == 0 && wptr == 0) ||
+- cntl_stack_offset == 0xffffffff ||
+- cntl_stack_size > 0x5000)
+- return false;
+-
+- /* Process is idle if both conditions are meet:
+- * queue's rptr equals to wptr
+- * control stack is empty, cntl_stack_offset = cntl_stack_size
+- */
+- return (rptr != wptr || cntl_stack_offset != cntl_stack_size);
+-}
+-
+ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+ {
+@@ -198,7 +159,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+ }
+
+- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) {
++ if (mm->dev->cwsr_enabled) {
+ m->cp_hqd_persistent_state |=
+ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+ m->cp_hqd_ctx_save_base_addr_lo =
+@@ -293,7 +254,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+ if (priv_cp_queues)
+ m->cp_hqd_pq_control |=
+ 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT;
+- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address)
++ if (mm->dev->cwsr_enabled)
+ m->cp_hqd_ctx_save_control =
+ atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
+ mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
+@@ -537,7 +498,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+ mqd->get_wave_state = get_wave_state;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -549,7 +509,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+- mqd->check_queue_active = check_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd;
+ #endif
+@@ -561,7 +520,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+- mqd->check_queue_active = check_sdma_queue_active;
+ #if defined(CONFIG_DEBUG_FS)
+ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+ #endif
+@@ -586,3 +544,4 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_tonga;
+ return mqd;
+ }
++
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+index c6080ed3..98c89d2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+@@ -26,6 +26,7 @@
+ #include "kfd_device_queue_manager.h"
+ #include "kfd_kernel_queue.h"
+ #include "kfd_priv.h"
++#include "kfd_pm4_opcodes.h"
+
+ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
+ unsigned int buffer_size_bytes)
+@@ -44,7 +45,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
+ unsigned int process_count, queue_count, compute_queue_count;
+ unsigned int map_queue_size;
+ unsigned int max_proc_per_quantum = 1;
+- struct kfd_dev *dev = pm->dqm->dev;
++
++ struct kfd_dev *dev = pm->dqm->dev;
+
+ process_count = pm->dqm->processes_count;
+ queue_count = pm->dqm->queue_count;
+@@ -55,20 +57,21 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
+ * hws_max_conc_proc has been done in
+ * kgd2kfd_device_init().
+ */
++
+ *over_subscription = false;
+
+ if (dev->max_proc_per_quantum > 1)
+ max_proc_per_quantum = dev->max_proc_per_quantum;
+
+ if ((process_count > max_proc_per_quantum) ||
+- compute_queue_count > get_queues_num(pm->dqm)) {
++ compute_queue_count > get_queues_num(pm->dqm)) {
+ *over_subscription = true;
+ pr_debug("Over subscribed runlist\n");
+ }
+
+- map_queue_size = pm->pmf->map_queues_size;
++ map_queue_size = pm->pmf->get_map_queues_packet_size();
+ /* calculate run list ib allocation size */
+- *rlib_size = process_count * pm->pmf->map_process_size +
++ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() +
+ queue_count * map_queue_size;
+
+ /*
+@@ -76,7 +79,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
+ * when over subscription
+ */
+ if (*over_subscription)
+- *rlib_size += pm->pmf->runlist_size;
++ *rlib_size += pm->pmf->get_runlist_packet_size();
+
+ pr_debug("runlist ib size %d\n", *rlib_size);
+ }
+@@ -157,7 +160,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+
+ proccesses_mapped++;
+- inc_wptr(&rl_wptr, pm->pmf->map_process_size,
++ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(),
+ alloc_size_bytes);
+
+ list_for_each_entry(kq, &qpd->priv_queue_list, list) {
+@@ -175,7 +178,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+
+ inc_wptr(&rl_wptr,
+- pm->pmf->map_queues_size,
++ pm->pmf->get_map_queues_packet_size(),
+ alloc_size_bytes);
+ }
+
+@@ -190,12 +193,11 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ &rl_buffer[rl_wptr],
+ q,
+ qpd->is_debug);
+-
+ if (retval)
+ return retval;
+
+ inc_wptr(&rl_wptr,
+- pm->pmf->map_queues_size,
++ pm->pmf->get_map_queues_packet_size(),
+ alloc_size_bytes);
+ }
+ }
+@@ -215,38 +217,37 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+ }
+
+-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
++ uint16_t fw_ver)
+ {
+- switch (dqm->dev->device_info->asic_family) {
++ pm->dqm = dqm;
++ mutex_init(&pm->lock);
++ pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
++ if (!pm->priv_queue) {
++ mutex_destroy(&pm->lock);
++ return -ENOMEM;
++ }
++ pm->allocated = false;
++
++ switch (pm->dqm->dev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ case CHIP_HAWAII:
+- /* PM4 packet structures on CIK are the same as on VI */
++ kfd_pm_func_init_cik(pm, fw_ver);
++ break;
+ case CHIP_CARRIZO:
+ case CHIP_TONGA:
+ case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
+- pm->pmf = &kfd_vi_pm_funcs;
++ kfd_pm_func_init_vi(pm, fw_ver);
+ break;
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+- pm->pmf = &kfd_v9_pm_funcs;
++ kfd_pm_func_init_v9(pm, fw_ver);
+ break;
+ default:
+- WARN(1, "Unexpected ASIC family %u",
+- dqm->dev->device_info->asic_family);
+- return -EINVAL;
+- }
+-
+- pm->dqm = dqm;
+- mutex_init(&pm->lock);
+- pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ);
+- if (!pm->priv_queue) {
+- mutex_destroy(&pm->lock);
+- return -ENOMEM;
++ BUG();
+ }
+- pm->allocated = false;
+
+ return 0;
+ }
+@@ -263,7 +264,7 @@ int pm_send_set_resources(struct packet_manager *pm,
+ uint32_t *buffer, size;
+ int retval = 0;
+
+- size = pm->pmf->set_resources_size;
++ size = pm->pmf->get_set_resources_packet_size();
+ mutex_lock(&pm->lock);
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+ size / sizeof(uint32_t),
+@@ -300,7 +301,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
+
+ pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
+
+- packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
++ packet_size_dwords = pm->pmf->get_runlist_packet_size() /
++ sizeof(uint32_t);
+ mutex_lock(&pm->lock);
+
+ retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+@@ -309,7 +311,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
+ goto fail_acquire_packet_buffer;
+
+ retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
+- rl_ib_size / sizeof(uint32_t), false);
++ rl_ib_size / sizeof(uint32_t), false);
+ if (retval)
+ goto fail_create_runlist;
+
+@@ -337,7 +339,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
+ if (WARN_ON(!fence_address))
+ return -EFAULT;
+
+- size = pm->pmf->query_status_size;
++ size = pm->pmf->get_query_status_packet_size();
+ mutex_lock(&pm->lock);
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+ size / sizeof(uint32_t), (unsigned int **)&buffer);
+@@ -366,7 +368,7 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
+ uint32_t *buffer, size;
+ int retval = 0;
+
+- size = pm->pmf->unmap_queues_size;
++ size = pm->pmf->get_unmap_queues_packet_size();
+ mutex_lock(&pm->lock);
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+ size / sizeof(uint32_t), (unsigned int **)&buffer);
+@@ -398,51 +400,17 @@ void pm_release_ib(struct packet_manager *pm)
+ mutex_unlock(&pm->lock);
+ }
+
+-#if defined(CONFIG_DEBUG_FS)
+-
+ int pm_debugfs_runlist(struct seq_file *m, void *data)
+ {
+ struct packet_manager *pm = data;
+
+- mutex_lock(&pm->lock);
+-
+ if (!pm->allocated) {
+ seq_puts(m, " No active runlist\n");
+- goto out;
++ return 0;
+ }
+
+ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+ pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
+
+-out:
+- mutex_unlock(&pm->lock);
+ return 0;
+ }
+-
+-int pm_debugfs_hang_hws(struct packet_manager *pm)
+-{
+- uint32_t *buffer, size;
+- int r = 0;
+-
+- size = pm->pmf->query_status_size;
+- mutex_lock(&pm->lock);
+- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+- size / sizeof(uint32_t), (unsigned int **)&buffer);
+- if (!buffer) {
+- pr_err("Failed to allocate buffer on kernel queue\n");
+- r = -ENOMEM;
+- goto out;
+- }
+- memset(buffer, 0x55, size);
+- pm->priv_queue->ops.submit_packet(pm->priv_queue);
+-
+- pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.",
+- buffer[0], buffer[1], buffer[2], buffer[3],
+- buffer[4], buffer[5], buffer[6]);
+-out:
+- mutex_unlock(&pm->lock);
+- return r;
+-}
+-
+-
+-#endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+index 87344cc..fae8e8c 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+@@ -49,9 +49,9 @@
+ #include <linux/slab.h>
+ #include <linux/scatterlist.h>
+ #include <linux/module.h>
+-#include <drm/amd_rdma.h>
+
+ #include "kfd_priv.h"
++#include "amd_rdma.h"
+
+
+
+@@ -137,6 +137,7 @@ static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle);
+
+ static const struct amd_rdma_interface *rdma_interface;
+
++static invalidate_peer_memory ib_invalidate_callback;
+ static void *ib_reg_handle;
+
+ struct amd_mem_context {
+@@ -168,6 +169,9 @@ static void free_callback(void *client_priv)
+
+ pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context);
+
++ /* Call back IB stack asking to invalidate memory */
++ (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context);
++
+ /* amdkfd will free resources when we return from this callback.
+ * Set flag to inform that there is nothing to do on "put_pages", etc.
+ */
+@@ -474,7 +478,7 @@ void kfd_init_peer_direct(void)
+ strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION);
+
+ ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client,
+- NULL);
++ &ib_invalidate_callback);
+
+ if (!ib_reg_handle) {
+ pr_err("Cannot register peer memory client\n");
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index 7869a9d..b2ef0f5 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -30,15 +30,16 @@
+ #include <linux/atomic.h>
+ #include <linux/workqueue.h>
+ #include <linux/spinlock.h>
+-#include <linux/kfd_ioctl.h>
+ #include <linux/idr.h>
++#include <linux/kfd_ioctl.h>
++#include <linux/pid.h>
++#include <linux/interval_tree.h>
+ #include <linux/seq_file.h>
+ #include <linux/kref.h>
+ #include <linux/kfifo.h>
+-#include <linux/pid.h>
+-#include <linux/interval_tree.h>
+ #include <kgd_kfd_interface.h>
+
++#include "amd_rdma.h"
+ #include "amd_shared.h"
+
+ #define KFD_SYSFS_FILE_MODE 0444
+@@ -49,7 +50,8 @@
+ /* Use upper bits of mmap offset to store KFD driver specific information.
+ * BITS[63:62] - Encode MMAP type
+ * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to
+- * BITS[45:0] - MMAP offset value
++ * BITS[45:40] - Reserved. Not Used.
++ * BITS[39:0] - MMAP offset value. Used by TTM.
+ *
+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+ * defines are w.r.t to PAGE_SIZE
+@@ -68,7 +70,7 @@
+ #define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \
+ >> KFD_MMAP_GPU_ID_SHIFT)
+
+-#define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT)
++#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT)
+ #define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK)
+
+ /*
+@@ -81,6 +83,7 @@
+ #define KFD_CIK_HIQ_PIPE 4
+ #define KFD_CIK_HIQ_QUEUE 0
+
++
+ /* Macro for allocating structures */
+ #define kfd_alloc_struct(ptr_to_struct) \
+ ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL))
+@@ -113,14 +116,14 @@ extern int max_num_of_queues_per_device;
+ /* Kernel module parameter to specify the scheduling policy */
+ extern int sched_policy;
+
++extern int cwsr_enable;
++
+ /*
+ * Kernel module parameter to specify the maximum process
+ * number per HW scheduler
+ */
+ extern int hws_max_conc_proc;
+
+-extern int cwsr_enable;
+-
+ /*
+ * Kernel module parameter to specify whether to send sigterm to HSA process on
+ * unhandled exception
+@@ -142,18 +145,13 @@ extern int ignore_crat;
+ /*
+ * Set sh_mem_config.retry_disable on Vega10
+ */
+-extern int noretry;
++extern int vega10_noretry;
+
+ /*
+ * Enable privileged mode for all CP queues including user queues
+ */
+ extern int priv_cp_queues;
+
+-/*
+- * Halt if HWS hang is detected
+- */
+-extern int halt_if_hws_hang;
+-
+ /**
+ * enum kfd_sched_policy
+ *
+@@ -210,7 +208,6 @@ struct kfd_device_info {
+ bool needs_pci_atomics;
+ /* obtain from adev->sdma.num_instances */
+ unsigned int num_sdma_engines;
+- unsigned int num_sdma_queues_per_engine;
+ };
+
+ struct kfd_mem_obj {
+@@ -294,8 +291,6 @@ struct kfd_dev {
+ bool cwsr_enabled;
+ const void *cwsr_isa;
+ unsigned int cwsr_isa_size;
+-
+- bool pci_atomic_requested;
+ };
+
+ struct kfd_ipc_obj;
+@@ -306,41 +301,6 @@ struct kfd_bo {
+ struct kfd_dev *dev;
+ struct list_head cb_data_head;
+ struct kfd_ipc_obj *kfd_ipc_obj;
+- /* page-aligned VA address */
+- uint64_t cpuva;
+- unsigned int mem_type;
+-};
+-
+-struct cma_system_bo {
+- struct kgd_mem *mem;
+- struct sg_table *sg;
+- struct kfd_dev *dev;
+- struct list_head list;
+-};
+-
+-/* Similar to iov_iter */
+-struct cma_iter {
+- /* points to current entry of range array */
+- struct kfd_memory_range *array;
+- /* total number of entries in the initial array */
+- unsigned long nr_segs;
+- /* total amount of data pointed by kfd array*/
+- unsigned long total;
+- /* offset into the entry pointed by cma_iter.array */
+- unsigned long offset;
+- struct kfd_process *p;
+- struct mm_struct *mm;
+- struct task_struct *task;
+- /* current kfd_bo associated with cma_iter.array.va_addr */
+- struct kfd_bo *cur_bo;
+- /* offset w.r.t cur_bo */
+- unsigned long bo_offset;
+- /* If cur_bo is a userptr BO, then a shadow system BO is created
+- * using its underlying pages. cma_bo holds this BO. cma_list is a
+- * list cma_bos created in one session
+- */
+- struct cma_system_bo *cma_bo;
+- struct list_head cma_list;
+ };
+
+ /* KGD2KFD callbacks */
+@@ -444,11 +404,7 @@ enum KFD_QUEUE_PRIORITY {
+ * @is_interop: Defines if this is a interop queue. Interop queue means that
+ * the queue can access both graphics and compute resources.
+ *
+- * @is_evicted: Defines if the queue is evicted. Only active queues
+- * are evicted, rendering them inactive.
+- *
+- * @is_active: Defines if the queue is active or not. @is_active and
+- * @is_evicted are protected by the DQM lock.
++ * @is_active: Defines if the queue is active or not.
+ *
+ * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid
+ * of the queue.
+@@ -470,7 +426,7 @@ struct queue_properties {
+ void __iomem *doorbell_ptr;
+ uint32_t doorbell_off;
+ bool is_interop;
+- bool is_evicted;
++ bool is_evicted; /* true -> queue is evicted */
+ bool is_active;
+ /* Not relevant for user mode queues in cp scheduling */
+ unsigned int vmid;
+@@ -589,6 +545,7 @@ struct qcm_process_device {
+ struct list_head priv_queue_list;
+
+ unsigned int queue_count;
++ /* a data field only meaningful for non-HWS case */
+ unsigned int vmid;
+ bool is_debug;
+ unsigned int evicted; /* eviction counter, 0=active */
+@@ -602,11 +559,11 @@ struct qcm_process_device {
+ * All the memory management data should be here too
+ */
+ uint64_t gds_context_area;
+- uint64_t page_table_base;
+ uint32_t sh_mem_config;
+ uint32_t sh_mem_bases;
+ uint32_t sh_mem_ape1_base;
+ uint32_t sh_mem_ape1_limit;
++ uint32_t page_table_base;
+ uint32_t gds_size;
+ uint32_t num_gws;
+ uint32_t num_oac;
+@@ -619,11 +576,11 @@ struct qcm_process_device {
+ uint64_t tma_addr;
+
+ /* IB memory */
+- uint64_t ib_base;
++ uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */
+ void *ib_kaddr;
+
+ /*doorbell resources per process per device*/
+- unsigned long *doorbell_bitmap;
++ unsigned long *doorbell_bitmap;
+ };
+
+ /* KFD Memory Eviction */
+@@ -635,10 +592,11 @@ struct qcm_process_device {
+ /* Approx. time before evicting the process again */
+ #define PROCESS_ACTIVE_TIME_MS 10
+
+-int kgd2kfd_quiesce_mm(struct mm_struct *mm);
+-int kgd2kfd_resume_mm(struct mm_struct *mm);
+ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
+ struct dma_fence *fence);
++int kfd_process_evict_queues(struct kfd_process *p);
++int kfd_process_restore_queues(struct kfd_process *p);
++
+
+ /* 8 byte handle containing GPU ID in the most significant 4 bytes and
+ * idr_handle in the least significant 4 bytes
+@@ -754,14 +712,13 @@ struct kfd_process {
+ struct idr event_idr;
+ /* Event page */
+ struct kfd_signal_page *signal_page;
+- size_t signal_mapped_size;
+ size_t signal_event_count;
+ bool signal_event_limit_reached;
+
+ struct rb_root_cached bo_interval_tree;
+
+ /* Information used for memory eviction */
+- void *kgd_process_info;
++ void *process_info;
+ /* Eviction fence that is attached to all the BOs of this process. The
+ * fence will be triggered during eviction and new one will be created
+ * during restore
+@@ -804,32 +761,29 @@ struct amdkfd_ioctl_desc {
+ int kfd_process_create_wq(void);
+ void kfd_process_destroy_wq(void);
+ struct kfd_process *kfd_create_process(struct file *filep);
+-struct kfd_process *kfd_get_process(const struct task_struct *);
++struct kfd_process *kfd_get_process(const struct task_struct *task);
+ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
+ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
+ void kfd_unref_process(struct kfd_process *p);
+-int kfd_process_evict_queues(struct kfd_process *p);
+-int kfd_process_restore_queues(struct kfd_process *p);
+ void kfd_suspend_all_processes(void);
+ int kfd_resume_all_processes(void);
+
+ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
+ struct file *drm_file);
+ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
+- struct kfd_process *p);
++ struct kfd_process *p);
+ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
+ struct kfd_process *p);
+ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+ struct kfd_process *p);
+
+-int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
+- struct vm_area_struct *vma);
++int kfd_reserved_mem_mmap(struct kfd_process *process,
++ struct vm_area_struct *vma);
+
+ /* KFD process API for creating and translating handles */
+ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+ void *mem, uint64_t start,
+- uint64_t length, uint64_t cpuva,
+- unsigned int mem_type,
++ uint64_t length,
+ struct kfd_ipc_obj *ipc_obj);
+ void *kfd_process_device_translate_handle(struct kfd_process_device *p,
+ int handle);
+@@ -864,7 +818,7 @@ void kfd_pasid_free(unsigned int pasid);
+ size_t kfd_doorbell_process_slice(struct kfd_dev *kfd);
+ int kfd_doorbell_init(struct kfd_dev *kfd);
+ void kfd_doorbell_fini(struct kfd_dev *kfd);
+-int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
++int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process,
+ struct vm_area_struct *vma);
+ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+ unsigned int *doorbell_off);
+@@ -921,6 +875,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd);
+
+ /* amdkfd Apertures */
+ int kfd_init_apertures(struct kfd_process *process);
++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
++ uint64_t base, uint64_t limit);
+
+ /* Queue Context Management */
+ int init_queue(struct queue **q, const struct queue_properties *properties);
+@@ -975,6 +931,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
+ void __user *ctl_stack,
+ u32 *ctl_stack_used_size,
+ u32 *save_area_used_size);
++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm);
++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm);
+
+ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ unsigned int fence_value,
+@@ -985,6 +943,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ #define KFD_FENCE_COMPLETED (100)
+ #define KFD_FENCE_INIT (10)
+
++struct packet_manager_func;
++
+ struct packet_manager {
+ struct device_queue_manager *dqm;
+ struct kernel_queue *priv_queue;
+@@ -993,11 +953,11 @@ struct packet_manager {
+ struct kfd_mem_obj *ib_buffer_obj;
+ unsigned int ib_size_bytes;
+
+- const struct packet_manager_funcs *pmf;
++ struct packet_manager_funcs *pmf;
+ };
+
+ struct packet_manager_funcs {
+- /* Support ASIC-specific packet formats for PM4 packets */
++ /* Support different firmware versions for PM4 packets */
+ int (*map_process)(struct packet_manager *pm, uint32_t *buffer,
+ struct qcm_process_device *qpd);
+ int (*runlist)(struct packet_manager *pm, uint32_t *buffer,
+@@ -1013,22 +973,20 @@ struct packet_manager_funcs {
+ unsigned int sdma_engine);
+ int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
+ uint64_t fence_address, uint32_t fence_value);
+- int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
+-
+- /* Packet sizes */
+- int map_process_size;
+- int runlist_size;
+- int set_resources_size;
+- int map_queues_size;
+- int unmap_queues_size;
+- int query_status_size;
+- int release_mem_size;
+-};
++ uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
+
+-extern const struct packet_manager_funcs kfd_vi_pm_funcs;
+-extern const struct packet_manager_funcs kfd_v9_pm_funcs;
++ uint32_t (*get_map_process_packet_size)(void);
++ uint32_t (*get_runlist_packet_size)(void);
++ uint32_t (*get_set_resources_packet_size)(void);
++ uint32_t (*get_map_queues_packet_size)(void);
++ uint32_t (*get_unmap_queues_packet_size)(void);
++ uint32_t (*get_query_status_packet_size)(void);
++ uint32_t (*get_release_mem_packet_size)(void);
+
+-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
++};
++
++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
++ uint16_t fw_ver);
+ void pm_uninit(struct packet_manager *pm);
+ int pm_send_set_resources(struct packet_manager *pm,
+ struct scheduling_resources *res);
+@@ -1043,10 +1001,37 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
+
+ void pm_release_ib(struct packet_manager *pm);
+
+-/* Following PM funcs can be shared among VI and AI */
++/* Following PM funcs can be shared among CIK and VI */
+ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
++ uint64_t ib, size_t ib_size_in_dwords, bool chain);
++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
++ struct queue *q, bool is_static);
+ int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+ struct scheduling_resources *res);
++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
++ enum kfd_queue_type type,
++ enum kfd_unmap_queues_filter filter,
++ uint32_t filter_param, bool reset,
++ unsigned int sdma_engine);
++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
++ uint64_t fence_address, uint32_t fence_value);
++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer);
++
++uint32_t pm_get_map_process_packet_size_vi(void);
++uint32_t pm_get_runlist_packet_size_vi(void);
++uint32_t pm_get_set_resources_packet_size_vi(void);
++uint32_t pm_get_map_queues_packet_size_vi(void);
++uint32_t pm_get_unmap_queues_packet_size_vi(void);
++uint32_t pm_get_query_status_packet_size_vi(void);
++uint32_t pm_get_release_mem_packet_size_vi(void);
++
++
++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver);
++void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver);
++
++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver);
++
+
+ uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
+
+@@ -1071,24 +1056,21 @@ void kfd_signal_iommu_event(struct kfd_dev *dev,
+ void kfd_signal_hw_exception_event(unsigned int pasid);
+ int kfd_set_event(struct kfd_process *p, uint32_t event_id);
+ int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
+-int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
+- uint64_t size);
+ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ uint32_t event_type, bool auto_reset, uint32_t node_id,
+ uint32_t *event_id, uint32_t *event_trigger_data,
+- uint64_t *event_page_offset, uint32_t *event_slot_index);
++ uint64_t *event_page_offset, uint32_t *event_slot_index,
++ void *kern_addr);
+ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
+
+ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+ struct kfd_vm_fault_info *info);
+
+-void kfd_signal_reset_event(struct kfd_dev *dev);
+-
+ void kfd_flush_tlb(struct kfd_process_device *pdd);
+
+ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
+
+-bool kfd_is_locked(void);
++#define KFD_SCRATCH_KV_FW_VER 413
+
+ /* PeerDirect support */
+ void kfd_init_peer_direct(void);
+@@ -1109,10 +1091,6 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data);
+ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data);
+ int pm_debugfs_runlist(struct seq_file *m, void *data);
+
+-int kfd_debugfs_hang_hws(struct kfd_dev *dev);
+-int pm_debugfs_hang_hws(struct packet_manager *pm);
+-int dqm_debugfs_execute_queues(struct device_queue_manager *dqm);
+-
+ #else
+
+ static inline void kfd_debugfs_init(void) {}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+index da67302..c627b63 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+@@ -30,7 +30,6 @@
+ #include <linux/notifier.h>
+ #include <linux/compat.h>
+ #include <linux/mman.h>
+-#include <linux/file.h>
+ #include <asm/page.h>
+ #include "kfd_ipc.h"
+
+@@ -61,6 +60,9 @@ static struct workqueue_struct *kfd_process_wq;
+ */
+ static struct workqueue_struct *kfd_restore_wq;
+
++#define MIN_IDR_ID 1
++#define MAX_IDR_ID 0 /*0 - for unlimited*/
++
+ static struct kfd_process *find_process(const struct task_struct *thread,
+ bool ref);
+ static void kfd_process_ref_release(struct kref *ref);
+@@ -78,12 +80,7 @@ int kfd_process_create_wq(void)
+ if (!kfd_restore_wq)
+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
+
+- if (!kfd_process_wq || !kfd_restore_wq) {
+- kfd_process_destroy_wq();
+- return -ENOMEM;
+- }
+-
+- return 0;
++ return kfd_process_wq && kfd_restore_wq ? 0 : -ENOMEM;
+ }
+
+ void kfd_process_destroy_wq(void)
+@@ -121,11 +118,9 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
+ struct kgd_mem *mem = NULL;
+ int handle;
+ int err;
+- unsigned int mem_type;
+
+ err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size,
+- pdd->vm, NULL, &mem, NULL,
+- flags);
++ pdd->vm, &mem, NULL, flags);
+ if (err)
+ goto err_alloc_mem;
+
+@@ -139,18 +134,13 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
+ goto sync_memory_failed;
+ }
+
+- mem_type = flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
+- KFD_IOC_ALLOC_MEM_FLAGS_GTT |
+- KFD_IOC_ALLOC_MEM_FLAGS_USERPTR |
+- KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL);
+-
+ /* Create an obj handle so kfd_process_device_remove_obj_handle
+ * will take care of the bo removal when the process finishes.
+ * We do not need to take p->mutex, because the process is just
+ * created and the ioctls have not had the chance to run.
+ */
+ handle = kfd_process_device_create_obj_handle(
+- pdd, mem, gpu_va, size, 0, mem_type, NULL);
++ pdd, mem, gpu_va, size, NULL);
+
+ if (handle < 0) {
+ err = handle;
+@@ -185,16 +175,14 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd,
+ /* kfd_process_device_reserve_ib_mem - Reserve memory inside the
+ * process for IB usage The memory reserved is for KFD to submit
+ * IB to AMDGPU from kernel. If the memory is reserved
+- * successfully, ib_kaddr will have the CPU/kernel
+- * address. Check ib_kaddr before accessing the memory.
++ * successfully, ib_kaddr_assigned will have the CPU/kernel
++ * address. Check ib_kaddr_assigned before accessing the memory.
+ */
+ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd)
+ {
+ struct qcm_process_device *qpd = &pdd->qpd;
+- uint32_t flags = ALLOC_MEM_FLAGS_GTT |
+- ALLOC_MEM_FLAGS_NO_SUBSTITUTE |
+- ALLOC_MEM_FLAGS_WRITABLE |
+- ALLOC_MEM_FLAGS_EXECUTABLE;
++ uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTE_ACCESS;
+ void *kaddr;
+ int ret;
+
+@@ -215,6 +203,7 @@ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd)
+ struct kfd_process *kfd_create_process(struct file *filep)
+ {
+ struct kfd_process *process;
++
+ struct task_struct *thread = current;
+
+ if (!thread->mm)
+@@ -255,8 +244,6 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread)
+ return ERR_PTR(-EINVAL);
+
+ process = find_process(thread, false);
+- if (!process)
+- return ERR_PTR(-EINVAL);
+
+ return process;
+ }
+@@ -352,9 +339,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
+
+ list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+ per_device_list) {
+- pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
+- pdd->dev->id, p->pasid);
+-
++ /* Destroy the GPUVM VM context */
+ if (pdd->drm_file)
+ fput(pdd->drm_file);
+ else if (pdd->vm)
+@@ -407,6 +392,9 @@ static void kfd_process_ref_release(struct kref *ref)
+ {
+ struct kfd_process *p = container_of(ref, struct kfd_process, ref);
+
++ if (WARN_ON(!kfd_process_wq))
++ return;
++
+ INIT_WORK(&p->release_work, kfd_process_wq_release);
+ queue_work(kfd_process_wq, &p->release_work);
+ }
+@@ -487,19 +475,17 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
+ if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base)
+ continue;
+
+- offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id))
+- << PAGE_SHIFT;
+- qpd->tba_addr = (int64_t)vm_mmap(filep, 0,
+- KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
+- MAP_SHARED, offset);
++ offset = (dev->id | KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT;
++ qpd->tba_addr = (uint64_t)vm_mmap(filep, 0,
++ KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
++ MAP_SHARED, offset);
+
+ if (IS_ERR_VALUE(qpd->tba_addr)) {
+- int err = qpd->tba_addr;
+-
+- pr_err("Failure to set tba address. error %d.\n", err);
++ pr_err("Failure to set tba address. error -%d.\n",
++ (int)qpd->tba_addr);
+ qpd->tba_addr = 0;
+ qpd->cwsr_kaddr = NULL;
+- return err;
++ return -ENOMEM;
+ }
+
+ memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
+@@ -516,8 +502,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
+ {
+ struct kfd_dev *dev = pdd->dev;
+ struct qcm_process_device *qpd = &pdd->qpd;
+- uint32_t flags = ALLOC_MEM_FLAGS_GTT |
+- ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTABLE;
++ uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_READONLY |
++ ALLOC_MEM_FLAGS_EXECUTE_ACCESS;
+ void *kaddr;
+ int ret;
+
+@@ -675,12 +662,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+ if (!pdd)
+ return NULL;
+
+- if (init_doorbell_bitmap(&pdd->qpd, dev)) {
+- pr_err("Failed to init doorbell for process\n");
+- kfree(pdd);
+- return NULL;
+- }
+-
+ pdd->dev = dev;
+ INIT_LIST_HEAD(&pdd->qpd.queues_list);
+ INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
+@@ -694,8 +675,19 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+
+ /* Init idr used for memory handle translation */
+ idr_init(&pdd->alloc_idr);
++ if (init_doorbell_bitmap(&pdd->qpd, dev)) {
++ pr_err("Failed to init doorbell for process\n");
++ goto err_create_pdd;
++ }
+
+ return pdd;
++
++err_create_pdd:
++ kfree(pdd->qpd.doorbell_bitmap);
++ idr_destroy(&pdd->alloc_idr);
++ list_del(&pdd->per_device_list);
++ kfree(pdd);
++ return NULL;
+ }
+
+ /**
+@@ -720,18 +712,17 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
+ int ret;
+
+ if (pdd->vm)
+- return drm_file ? -EBUSY : 0;
++ return 0;
+
+ p = pdd->process;
+ dev = pdd->dev;
+
+ if (drm_file)
+ ret = dev->kfd2kgd->acquire_process_vm(
+- dev->kgd, drm_file, p->pasid,
+- &pdd->vm, &p->kgd_process_info, &p->ef);
++ dev->kgd, drm_file, &pdd->vm, &p->process_info, &p->ef);
+ else
+ ret = dev->kfd2kgd->create_process_vm(
+- dev->kgd, p->pasid, &pdd->vm, &p->kgd_process_info, &p->ef);
++ dev->kgd, &pdd->vm, &p->process_info, &p->ef);
+ if (ret) {
+ pr_err("Failed to create process VM object\n");
+ return ret;
+@@ -815,8 +806,7 @@ bool kfd_has_process_device_data(struct kfd_process *p)
+ */
+ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+ void *mem, uint64_t start,
+- uint64_t length, uint64_t cpuva,
+- unsigned int mem_type,
++ uint64_t length,
+ struct kfd_ipc_obj *ipc_obj)
+ {
+ int handle;
+@@ -837,12 +827,15 @@ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+ buf_obj->mem = mem;
+ buf_obj->dev = pdd->dev;
+ buf_obj->kfd_ipc_obj = ipc_obj;
+- buf_obj->cpuva = cpuva;
+- buf_obj->mem_type = mem_type;
+
+ INIT_LIST_HEAD(&buf_obj->cb_data_head);
+
+- handle = idr_alloc(&pdd->alloc_idr, buf_obj, 0, 0, GFP_KERNEL);
++ idr_preload(GFP_KERNEL);
++
++ handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID,
++ GFP_NOWAIT);
++
++ idr_preload_end();
+
+ if (handle < 0)
+ kfree(buf_obj);
+@@ -945,6 +938,42 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
+ return ret_p;
+ }
+
++void kfd_suspend_all_processes(void)
++{
++ struct kfd_process *p;
++ unsigned int temp;
++ int idx = srcu_read_lock(&kfd_processes_srcu);
++
++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
++ cancel_delayed_work_sync(&p->eviction_work);
++ cancel_delayed_work_sync(&p->restore_work);
++
++ if (kfd_process_evict_queues(p))
++ pr_err("Failed to suspend process %d\n", p->pasid);
++ dma_fence_signal(p->ef);
++ dma_fence_put(p->ef);
++ p->ef = NULL;
++ }
++ srcu_read_unlock(&kfd_processes_srcu, idx);
++}
++
++int kfd_resume_all_processes(void)
++{
++ struct kfd_process *p;
++ unsigned int temp;
++ int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
++
++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
++ if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
++ pr_err("Restore process %d failed during resume\n",
++ p->pasid);
++ ret = -EFAULT;
++ }
++ }
++ srcu_read_unlock(&kfd_processes_srcu, idx);
++ return ret;
++}
++
+ /* This increments the process->ref counter. */
+ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
+ {
+@@ -1036,14 +1065,15 @@ static void evict_process_worker(struct work_struct *work)
+ "Eviction fence mismatch\n");
+
+ /* Narrow window of overlap between restore and evict work
+- * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
+- * unreserves KFD BOs, it is possible to evicted again. But
+- * restore has few more steps of finish. So lets wait for any
+- * previous restore work to complete
++ * item is possible. Once
++ * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs,
++ * it is possible to evicted again. But restore has few more
++ * steps of finish. So lets wait for any previous restore work
++ * to complete
+ */
+ flush_delayed_work(&p->restore_work);
+
+- pr_info("Started evicting pasid %d\n", p->pasid);
++ pr_info("Started evicting process of pasid %d\n", p->pasid);
+ ret = kfd_process_evict_queues(p);
+ if (!ret) {
+ dma_fence_signal(p->ef);
+@@ -1052,9 +1082,10 @@ static void evict_process_worker(struct work_struct *work)
+ queue_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
+
+- pr_info("Finished evicting pasid %d\n", p->pasid);
++ pr_info("Finished evicting process of pasid %d\n", p->pasid);
+ } else
+- pr_err("Failed to evict queues of pasid %d\n", p->pasid);
++ pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n",
++ p->pasid);
+ }
+
+ static void restore_process_worker(struct work_struct *work)
+@@ -1080,7 +1111,7 @@ static void restore_process_worker(struct work_struct *work)
+ struct kfd_process_device,
+ per_device_list);
+
+- pr_info("Started restoring pasid %d\n", p->pasid);
++ pr_info("Started restoring process of pasid %d\n", p->pasid);
+
+ /* Setting last_restore_timestamp before successful restoration.
+ * Otherwise this would have to be set by KGD (restore_process_bos)
+@@ -1093,11 +1124,10 @@ static void restore_process_worker(struct work_struct *work)
+ */
+
+ p->last_restore_timestamp = get_jiffies_64();
+- ret = pdd->dev->kfd2kgd->restore_process_bos(p->kgd_process_info,
+- &p->ef);
++ ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef);
+ if (ret) {
+- pr_info("Failed to restore BOs of pasid %d, retry after %d ms\n",
+- p->pasid, PROCESS_BACK_OFF_TIME_MS);
++ pr_info("Restore failed, try again after %d ms\n",
++ PROCESS_BACK_OFF_TIME_MS);
+ ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
+ WARN(!ret, "reschedule restore work failed\n");
+@@ -1105,54 +1135,21 @@ static void restore_process_worker(struct work_struct *work)
+ }
+
+ ret = kfd_process_restore_queues(p);
+- if (!ret)
+- pr_info("Finished restoring pasid %d\n", p->pasid);
+- else
+- pr_err("Failed to restore queues of pasid %d\n", p->pasid);
+-}
+-
+-void kfd_suspend_all_processes(void)
+-{
+- struct kfd_process *p;
+- unsigned int temp;
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- cancel_delayed_work_sync(&p->eviction_work);
+- cancel_delayed_work_sync(&p->restore_work);
+-
+- if (kfd_process_evict_queues(p))
+- pr_err("Failed to suspend process %d\n", p->pasid);
+- dma_fence_signal(p->ef);
+- dma_fence_put(p->ef);
+- p->ef = NULL;
+- }
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-}
+-
+-int kfd_resume_all_processes(void)
+-{
+- struct kfd_process *p;
+- unsigned int temp;
+- int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
++ if (ret)
++ pr_err("Failed to resume user queues\n");
+
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
+- pr_err("Restore process %d failed during resume\n",
+- p->pasid);
+- ret = -EFAULT;
+- }
+- }
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+- return ret;
++ pr_info("Finished restoring process of pasid %d\n", p->pasid);
+ }
+
+-int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
+- struct vm_area_struct *vma)
++int kfd_reserved_mem_mmap(struct kfd_process *process,
++ struct vm_area_struct *vma)
+ {
++ struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
+ struct kfd_process_device *pdd;
+ struct qcm_process_device *qpd;
+
++ if (!dev)
++ return -EINVAL;
+ if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
+ pr_err("Incorrect CWSR mapping size.\n");
+ return -EINVAL;
+@@ -1178,6 +1175,7 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process,
+ KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
+ }
+
++
+ void kfd_flush_tlb(struct kfd_process_device *pdd)
+ {
+ struct kfd_dev *dev = pdd->dev;
+@@ -1212,7 +1210,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
+ r = pqm_debugfs_mqds(m, &p->pqm);
+ mutex_unlock(&p->mutex);
+
+- if (r)
++ if (r != 0)
+ break;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+index 8933323..52882e0 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+@@ -188,7 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ case KFD_QUEUE_TYPE_SDMA:
+ if (dev->dqm->sdma_queue_count
+ >= get_num_sdma_queues(dev->dqm)) {
+- pr_debug("Over-subscription is not allowed for SDMA.\n");
++ pr_debug("Over-subscription is not allowed for SDMA\n");
+ retval = -EPERM;
+ goto err_create_queue;
+ }
+@@ -206,7 +206,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ case KFD_QUEUE_TYPE_COMPUTE:
+ /* check if there is over subscription */
+ if ((dev->dqm->sched_policy ==
+- KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
++ KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
+ ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
+ (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) {
+ pr_debug("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
+@@ -241,8 +241,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ }
+
+ if (retval != 0) {
+- pr_err("Pasid %d DQM create queue %d failed. ret %d\n",
+- pqm->process->pasid, type, retval);
++ pr_err("DQM create queue failed\n");
+ goto err_create_queue;
+ }
+
+@@ -318,16 +317,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
+
+ if (pqn->q) {
+ dqm = pqn->q->device->dqm;
++ kfree(pqn->q->properties.cu_mask);
++ pqn->q->properties.cu_mask = NULL;
+ retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
+ if (retval) {
+- pr_err("Pasid %d destroy queue %d failed, ret %d\n",
+- pqm->process->pasid,
+- pqn->q->properties.queue_id, retval);
+- if (retval != -ETIME)
+- goto err_destroy_queue;
++ pr_debug("Destroy queue failed, returned %d\n", retval);
++ goto err_destroy_queue;
+ }
+- kfree(pqn->q->properties.cu_mask);
+- pqn->q->properties.cu_mask = NULL;
+ uninit_queue(pqn->q);
+ }
+
+@@ -439,7 +435,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
+ struct process_queue_node *pqn;
+ struct queue *q;
+ enum KFD_MQD_TYPE mqd_type;
+- struct mqd_manager *mqd_mgr;
++ struct mqd_manager *mqd_manager;
+ int r = 0;
+
+ list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+@@ -462,11 +458,11 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
+ q->properties.type, q->device->id);
+ continue;
+ }
+- mqd_mgr = q->device->dqm->ops.get_mqd_manager(
++ mqd_manager = q->device->dqm->ops.get_mqd_manager(
+ q->device->dqm, mqd_type);
+ } else if (pqn->kq) {
+ q = pqn->kq->queue;
+- mqd_mgr = pqn->kq->mqd_mgr;
++ mqd_manager = pqn->kq->mqd;
+ switch (q->properties.type) {
+ case KFD_QUEUE_TYPE_DIQ:
+ seq_printf(m, " DIQ on device %x\n",
+@@ -486,7 +482,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
+ continue;
+ }
+
+- r = mqd_mgr->debugfs_show_mqd(m, q->mqd);
++ r = mqd_manager->debugfs_show_mqd(m, q->mqd);
+ if (r != 0)
+ break;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+index 6dcd621..a5315d4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+@@ -36,8 +36,8 @@ void print_queue_properties(struct queue_properties *q)
+ pr_debug("Queue Address: 0x%llX\n", q->queue_address);
+ pr_debug("Queue Id: %u\n", q->queue_id);
+ pr_debug("Queue Process Vmid: %u\n", q->vmid);
+- pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr);
+- pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr);
++ pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr);
++ pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr);
+ pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr);
+ pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off);
+ }
+@@ -53,8 +53,8 @@ void print_queue(struct queue *q)
+ pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address);
+ pr_debug("Queue Id: %u\n", q->properties.queue_id);
+ pr_debug("Queue Process Vmid: %u\n", q->properties.vmid);
+- pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr);
+- pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr);
++ pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr);
++ pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr);
+ pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr);
+ pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off);
+ pr_debug("Queue MQD Address: 0x%p\n", q->mqd);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+index 3454514..985855f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+@@ -25,7 +25,7 @@
+ #include <linux/pid.h>
+ #include <linux/err.h>
+ #include <linux/slab.h>
+-#include <drm/amd_rdma.h>
++#include "amd_rdma.h"
+ #include "kfd_priv.h"
+
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+index 7702156..320c8d3 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+@@ -196,7 +196,6 @@ struct kfd_topology_device *kfd_create_topology_device(
+ return dev;
+ }
+
+-
+ #define sysfs_show_gen_prop(buffer, fmt, ...) \
+ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__)
+ #define sysfs_show_32bit_prop(buffer, name, value) \
+@@ -740,7 +739,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ }
+
+ /* All hardware blocks have the same number of attributes. */
+- num_attrs = ARRAY_SIZE(perf_attr_iommu);
++ num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr);
+ list_for_each_entry(perf, &dev->perf_props, list) {
+ perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr)
+ * num_attrs + sizeof(struct attribute_group),
+@@ -891,8 +890,7 @@ static void kfd_debug_print_topology(void)
+ up_read(&topology_lock);
+ }
+
+-/* Helper function for intializing platform_xx members of
+- * kfd_system_properties. Uses OEM info from the last CPU/APU node.
++/* Helper function for intializing platform_xx members of kfd_system_properties
+ */
+ static void kfd_update_system_properties(void)
+ {
+@@ -1015,12 +1013,13 @@ int kfd_topology_init(void)
+ */
+ #ifdef CONFIG_ACPI
+ ret = kfd_create_crat_image_acpi(&crat_image, &image_size);
+- if (!ret) {
++ if (ret == 0) {
+ ret = kfd_parse_crat_table(crat_image,
+ &temp_topology_device_list,
+ proximity_domain);
+ if (ret ||
+- kfd_is_acpi_crat_invalid(&temp_topology_device_list)) {
++ kfd_is_acpi_crat_invalid(&temp_topology_device_list)) {
++
+ kfd_release_topology_device_list(
+ &temp_topology_device_list);
+ kfd_destroy_crat_image(crat_image);
+@@ -1030,8 +1029,8 @@ int kfd_topology_init(void)
+ #endif
+ if (!crat_image) {
+ ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
+- COMPUTE_UNIT_CPU, NULL,
+- proximity_domain);
++ COMPUTE_UNIT_CPU, NULL,
++ proximity_domain);
+ cpu_only_node = 1;
+ if (ret) {
+ pr_err("Error creating VCRAT table for CPU\n");
+@@ -1039,8 +1038,8 @@ int kfd_topology_init(void)
+ }
+
+ ret = kfd_parse_crat_table(crat_image,
+- &temp_topology_device_list,
+- proximity_domain);
++ &temp_topology_device_list,
++ proximity_domain);
+ if (ret) {
+ pr_err("Error parsing VCRAT table for CPU\n");
+ goto err;
+@@ -1052,12 +1051,12 @@ int kfd_topology_init(void)
+
+ down_write(&topology_lock);
+ kfd_topology_update_device_list(&temp_topology_device_list,
+- &topology_device_list);
++ &topology_device_list);
+ atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
+ ret = kfd_topology_update_sysfs();
+ up_write(&topology_lock);
+
+- if (!ret) {
++ if (ret == 0) {
+ sys_props.generation_count++;
+ kfd_update_system_properties();
+ kfd_debug_print_topology();
+@@ -1145,6 +1144,7 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
+ break;
+ }
+ up_write(&topology_lock);
++
+ return out_dev;
+ }
+
+@@ -1182,40 +1182,17 @@ static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
+
+ static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
+ {
+- struct kfd_iolink_properties *link, *cpu_link;
+- struct kfd_topology_device *cpu_dev;
+- uint32_t cap;
+- uint32_t cpu_flag = CRAT_IOLINK_FLAGS_ENABLED;
+- uint32_t flag = CRAT_IOLINK_FLAGS_ENABLED;
++ struct kfd_iolink_properties *link;
+
+ if (!dev || !dev->gpu)
+ return;
+
+- pcie_capability_read_dword(dev->gpu->pdev,
+- PCI_EXP_DEVCAP2, &cap);
+-
+- if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+- PCI_EXP_DEVCAP2_ATOMIC_COMP64)))
+- cpu_flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+- CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+-
+- if (!dev->gpu->pci_atomic_requested ||
+- dev->gpu->device_info->asic_family == CHIP_HAWAII)
+- flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
++ /* GPU only creates direck links so apply flags setting to all */
++ if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
++ list_for_each_entry(link, &dev->io_link_props, list)
++ link->flags = CRAT_IOLINK_FLAGS_ENABLED |
++ CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+ CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+-
+- /* GPU only creates direct links so apply flags setting to all */
+- list_for_each_entry(link, &dev->io_link_props, list) {
+- link->flags = flag;
+- cpu_dev = kfd_topology_device_by_proximity_domain(
+- link->node_to);
+- if (cpu_dev) {
+- list_for_each_entry(cpu_link,
+- &cpu_dev->io_link_props, list)
+- if (cpu_link->node_to == link->node_from)
+- cpu_link->flags = cpu_flag;
+- }
+- }
+ }
+
+ int kfd_topology_add_device(struct kfd_dev *gpu)
+@@ -1235,7 +1212,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
+
+ pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
+
+- proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
++ proximity_domain = atomic_inc_return(&
++ topology_crat_proximity_domain);
+
+ /* Check to see if this gpu device exists in the topology_device_list.
+ * If so, assign the gpu to that device,
+@@ -1246,16 +1224,15 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
+ dev = kfd_assign_gpu(gpu);
+ if (!dev) {
+ res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+- COMPUTE_UNIT_GPU, gpu,
+- proximity_domain);
++ COMPUTE_UNIT_GPU,
++ gpu, proximity_domain);
+ if (res) {
+ pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+ return res;
+ }
+ res = kfd_parse_crat_table(crat_image,
+- &temp_topology_device_list,
+- proximity_domain);
++ &temp_topology_device_list, proximity_domain);
+ if (res) {
+ pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
+ gpu_id);
+@@ -1272,13 +1249,14 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
+ res = kfd_topology_update_sysfs();
+ up_write(&topology_lock);
+
+- if (!res)
++ if (res == 0)
+ sys_props.generation_count++;
+ else
+ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+ gpu_id, res);
+ dev = kfd_assign_gpu(gpu);
+- if (WARN_ON(!dev)) {
++ if (!dev) {
++ pr_err("Could not assign GPU\n");
+ res = -ENODEV;
+ goto err;
+ }
+@@ -1331,22 +1309,20 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+ break;
+ case CHIP_VEGA10:
+- case CHIP_VEGA20:
+ case CHIP_RAVEN:
+ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+ break;
+ default:
+- WARN(1, "Unexpected ASIC family %u",
+- dev->gpu->device_info->asic_family);
++ BUG();
+ }
+
+ /* Fix errors in CZ CRAT.
+- * simd_count: Carrizo CRAT reports wrong simd_count, probably
+- * because it doesn't consider masked out CUs
+- * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd
+- * capability flag: Carrizo CRAT doesn't report IOMMU flags
++ * simd_count: Carrizo CRAT reports wrong simd_count, probably because
++ * it doesn't consider masked out CUs
++ * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd.
++ * capability flag: Carrizo CRAT doesn't report IOMMU flags.
+ */
+ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
+ dev->node_props.simd_count =
+@@ -1386,7 +1362,7 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
+
+ up_write(&topology_lock);
+
+- if (!res)
++ if (res == 0)
+ kfd_notify_gpu_change(gpu_id, 0);
+
+ return res;
+@@ -1427,7 +1403,7 @@ static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
+ {
+ int first_cpu_of_numa_node;
+
+- if (!cpumask || cpumask == cpu_none_mask)
++ if (!cpumask || (cpumask == cpu_none_mask))
+ return -1;
+ first_cpu_of_numa_node = cpumask_first(cpumask);
+ if (first_cpu_of_numa_node >= nr_cpu_ids)
+@@ -1470,7 +1446,7 @@ int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
+
+ seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+ r = dqm_debugfs_hqds(m, dev->gpu->dqm);
+- if (r)
++ if (r != 0)
+ break;
+ }
+
+@@ -1495,7 +1471,7 @@ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data)
+
+ seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+ r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets);
+- if (r)
++ if (r != 0)
+ break;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+index 2b36baf..f4d29c4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -46,6 +46,9 @@
+ #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
+ #define HSA_CAP_DOORBELL_TYPE_1_0 0x1
+ #define HSA_CAP_DOORBELL_TYPE_2_0 0x2
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
++#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
++#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
+ #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
+
+ struct kfd_node_properties {
+@@ -166,9 +169,9 @@ struct kfd_topology_device {
+ struct attribute attr_gpuid;
+ struct attribute attr_name;
+ struct attribute attr_props;
+- uint8_t oem_id[CRAT_OEMID_LENGTH];
+- uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
+- uint32_t oem_revision;
++ uint8_t oem_id[CRAT_OEMID_LENGTH];
++ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
++ uint32_t oem_revision;
+ };
+
+ struct kfd_system_properties {
+@@ -187,8 +190,4 @@ struct kfd_topology_device *kfd_create_topology_device(
+ struct list_head *device_list);
+ void kfd_release_topology_device_list(struct list_head *device_list);
+
+-extern bool amd_iommu_pc_supported(void);
+-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+-
+ #endif /* __KFD_TOPOLOGY_H__ */
+diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+index 0bc0b25..e00d03d 100644
+--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
++++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright 2016-2018 Advanced Micro Devices, Inc.
++ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+@@ -22,8 +22,45 @@
+
+ #ifndef HSA_SOC15_INT_H_INCLUDED
+ #define HSA_SOC15_INT_H_INCLUDED
++/*
++ * vega10+ IH clients
++ */
++enum soc15_ih_client_id {
++ SOC15_IH_CLIENTID_IH = 0x00,
++ SOC15_IH_CLIENTID_ACP = 0x01,
++ SOC15_IH_CLIENTID_ATHUB = 0x02,
++ SOC15_IH_CLIENTID_BIF = 0x03,
++ SOC15_IH_CLIENTID_DCE = 0x04,
++ SOC15_IH_CLIENTID_ISP = 0x05,
++ SOC15_IH_CLIENTID_PCIE0 = 0x06,
++ SOC15_IH_CLIENTID_RLC = 0x07,
++ SOC15_IH_CLIENTID_SDMA0 = 0x08,
++ SOC15_IH_CLIENTID_SDMA1 = 0x09,
++ SOC15_IH_CLIENTID_SE0SH = 0x0a,
++ SOC15_IH_CLIENTID_SE1SH = 0x0b,
++ SOC15_IH_CLIENTID_SE2SH = 0x0c,
++ SOC15_IH_CLIENTID_SE3SH = 0x0d,
++ SOC15_IH_CLIENTID_SYSHUB = 0x0e,
++ SOC15_IH_CLIENTID_THM = 0x0f,
++ SOC15_IH_CLIENTID_UVD = 0x10,
++ SOC15_IH_CLIENTID_VCE0 = 0x11,
++ SOC15_IH_CLIENTID_VMC = 0x12,
++ SOC15_IH_CLIENTID_XDMA = 0x13,
++ SOC15_IH_CLIENTID_GRBM_CP = 0x14,
++ SOC15_IH_CLIENTID_ATS = 0x15,
++ SOC15_IH_CLIENTID_ROM_SMUIO = 0x16,
++ SOC15_IH_CLIENTID_DF = 0x17,
++ SOC15_IH_CLIENTID_VCE1 = 0x18,
++ SOC15_IH_CLIENTID_PWR = 0x19,
++ SOC15_IH_CLIENTID_UTCL2 = 0x1b,
++ SOC15_IH_CLIENTID_EA = 0x1c,
++ SOC15_IH_CLIENTID_UTCL2LOG = 0x1d,
++ SOC15_IH_CLIENTID_MP0 = 0x1e,
++ SOC15_IH_CLIENTID_MP1 = 0x1f,
++
++ SOC15_IH_CLIENTID_MAX
++};
+
+-#include "soc15_ih_clientid.h"
+
+ #define SOC15_INTSRC_CP_END_OF_PIPE 181
+ #define SOC15_INTSRC_CP_BAD_OPCODE 183
+--
+2.7.4
+