Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3339-revert-to-old-stack.patch')
-rw-r--r--  meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3339-revert-to-old-stack.patch  20647
1 file changed, 20647 insertions(+), 0 deletions(-)
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3339-revert-to-old-stack.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3339-revert-to-old-stack.patch
new file mode 100644
index 00000000..eb3668aa
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3339-revert-to-old-stack.patch
@@ -0,0 +1,20647 @@
+From 37d3acaf273e6505bf399d1a4fefa7a32b967671 Mon Sep 17 00:00:00 2001
+From: Sanjay R Mehta <sanju.mehta@amd.com>
+Date: Wed, 16 May 2018 15:41:36 +0530
+Subject: [PATCH 3339/4131] revert to old stack
+
+Signed-off-by: Sanjay R Mehta <sanju.mehta@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/Kconfig | 3 +-
+ drivers/gpu/drm/amd/amdkfd/Makefile | 21 +-
+ drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 74 +-
+ drivers/gpu/drm/amd/amdkfd/cik_int.h | 24 +-
+ drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 1384 -------------------
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1419 --------------------
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1188 +---------------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1339 ------------------
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 42 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 133 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 32 -
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 3 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 75 --
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 734 +---------
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1003 +++-----------
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 59 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 70 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 83 --
+ .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 112 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 109 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 688 +++++-----
+ drivers/gpu/drm/amd/amdkfd/kfd_events.h | 18 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 119 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 135 --
+ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 88 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 271 ----
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.h | 51 -
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 105 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 17 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 128 --
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 377 ------
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 361 -----
+ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 66 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 55 -
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 15 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 205 +--
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 524 --------
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 294 +---
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 389 ++++--
+ drivers/gpu/drm/amd/amdkfd/kfd_pasid.c | 90 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 513 -------
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 --------
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h | 97 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 537 ++------
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 914 ++-----------
+ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 232 +---
+ drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 294 ----
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1142 ++++++----------
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 42 +-
+ drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 --
+ 51 files changed, 2132 insertions(+), 14212 deletions(-)
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+ delete mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
+index 95be0dd..e13c67c 100644
+--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
++++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
+@@ -4,7 +4,6 @@
+
+ config HSA_AMD
+ tristate "HSA kernel driver for AMD GPU devices"
+- depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64)
+- select DRM_AMDGPU_USERPTR
++ depends on (DRM_RADEON || DRM_AMDGPU) && AMD_IOMMU_V2 && X86_64
+ help
+ Enable this if you want to use HSA features on AMD GPU devices.
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
+index fde693c..b400d56 100644
+--- a/drivers/gpu/drm/amd/amdkfd/Makefile
++++ b/drivers/gpu/drm/amd/amdkfd/Makefile
+@@ -2,25 +2,18 @@
+ # Makefile for Heterogenous System Architecture support for AMD GPU devices
+ #
+
+-FULL_AMD_PATH=$(src)/..
+-
+-ccflags-y := -I$(FULL_AMD_PATH)/include/ \
+- -I$(FULL_AMD_PATH)/include/asic_reg
++ccflags-y := -Idrivers/gpu/drm/amd/include/ \
++ -Idrivers/gpu/drm/amd/include/asic_reg
+
+ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \
+ kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \
+ kfd_process.o kfd_queue.o kfd_mqd_manager.o \
+ kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \
+- kfd_mqd_manager_v9.o \
+ kfd_kernel_queue.o kfd_kernel_queue_cik.o \
+- kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \
+- kfd_packet_manager.o kfd_process_queue_manager.o \
+- kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \
+- kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \
+- kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \
+- kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \
+- kfd_peerdirect.o kfd_ipc.o
+-
+-amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o
++ kfd_kernel_queue_vi.o kfd_packet_manager.o \
++ kfd_process_queue_manager.o kfd_device_queue_manager.o \
++ kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \
++ kfd_interrupt.o kfd_events.o cik_event_interrupt.o \
++ kfd_dbgdev.o kfd_dbgmgr.o
+
+ obj-$(CONFIG_HSA_AMD) += amdkfd.o
+diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+index 751c004..211fc48 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+@@ -24,90 +24,40 @@
+ #include "kfd_events.h"
+ #include "cik_int.h"
+
+-static bool is_cpc_vm_fault(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry)
+-{
+- const struct cik_ih_ring_entry *ihre =
+- (const struct cik_ih_ring_entry *)ih_ring_entry;
+-
+- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+- ihre->vmid >= dev->vm_info.first_vmid_kfd &&
+- ihre->vmid <= dev->vm_info.last_vmid_kfd)
+- return true;
+- return false;
+-}
+-
+ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry,
+- uint32_t *patched_ihre,
+- bool *patched_flag)
++ const uint32_t *ih_ring_entry)
+ {
++ unsigned int pasid;
+ const struct cik_ih_ring_entry *ihre =
+ (const struct cik_ih_ring_entry *)ih_ring_entry;
+- const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+- struct cik_ih_ring_entry *tmp_ihre =
+- (struct cik_ih_ring_entry *) patched_ihre;
+
+- /* This workaround is due to HW/FW limitation on Hawaii that
+- * VMID and PASID are not written into ih_ring_entry
+- */
+- if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+- dev->device_info->asic_family == CHIP_HAWAII) {
+- *patched_flag = true;
+- *tmp_ihre = *ihre;
++ pasid = (ihre->ring_id & 0xffff0000) >> 16;
+
+- tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
+- tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid(
+- dev->kgd, tmp_ihre->vmid);
+- return (tmp_ihre->pasid != 0) &&
+- tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd &&
+- tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd;
+- }
+ /* Do not process in ISR, just request it to be forwarded to WQ. */
+- return (ihre->pasid != 0) &&
++ return (pasid != 0) &&
+ (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
+- ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
+ ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
+- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+- is_cpc_vm_fault(dev, ih_ring_entry));
++ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE);
+ }
+
+ static void cik_event_interrupt_wq(struct kfd_dev *dev,
+ const uint32_t *ih_ring_entry)
+ {
++ unsigned int pasid;
+ const struct cik_ih_ring_entry *ihre =
+ (const struct cik_ih_ring_entry *)ih_ring_entry;
+- uint32_t context_id = ihre->data & 0xfffffff;
+
+- if (ihre->pasid == 0)
++ pasid = (ihre->ring_id & 0xffff0000) >> 16;
++
++ if (pasid == 0)
+ return;
+
+ if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE)
+- kfd_signal_event_interrupt(ihre->pasid, context_id, 28);
+- else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP)
+- kfd_signal_event_interrupt(ihre->pasid, context_id, 28);
++ kfd_signal_event_interrupt(pasid, 0, 0);
+ else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG)
+- kfd_signal_event_interrupt(ihre->pasid, context_id & 0xff, 8);
++ kfd_signal_event_interrupt(pasid, ihre->data & 0xFF, 8);
+ else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
+- kfd_signal_hw_exception_event(ihre->pasid);
+- else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+- ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+- struct kfd_vm_fault_info info;
+-
+- kfd_process_vm_fault(dev->dqm, ihre->pasid);
+-
+- memset(&info, 0, sizeof(info));
+- dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+- if (!info.page_addr && !info.status)
+- return;
+-
+- if (info.vmid == ihre->vmid)
+- kfd_signal_vm_fault_event(dev, ihre->pasid, &info);
+- else
+- kfd_signal_vm_fault_event(dev, ihre->pasid, NULL);
+- }
++ kfd_signal_hw_exception_event(pasid);
+ }
+
+ const struct kfd_event_interrupt_class event_interrupt_class_cik = {
+diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h
+index ff8255d..79a16d2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h
++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h
+@@ -26,32 +26,16 @@
+ #include <linux/types.h>
+
+ struct cik_ih_ring_entry {
+- uint32_t source_id:8;
+- uint32_t reserved1:8;
+- uint32_t reserved2:16;
+-
+- uint32_t data:28;
+- uint32_t reserved3:4;
+-
+- /* pipeid, meid and unused3 are officially called RINGID,
+- * but for our purposes, they always decode into pipe and ME.
+- */
+- uint32_t pipeid:2;
+- uint32_t meid:2;
+- uint32_t reserved4:4;
+- uint32_t vmid:8;
+- uint32_t pasid:16;
+-
+- uint32_t reserved5;
++ uint32_t source_id;
++ uint32_t data;
++ uint32_t ring_id;
++ uint32_t reserved;
+ };
+
+ #define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6
+ #define CIK_INTSRC_CP_END_OF_PIPE 0xB5
+ #define CIK_INTSRC_CP_BAD_OPCODE 0xB7
+ #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF
+-#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92
+-#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93
+-#define CIK_INTSRC_SDMA_TRAP 0xE0
+
+ #endif
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+index 37ce6dd..48769d1 100644
+--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h
++++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h
+@@ -33,8 +33,7 @@
+ #define APE1_MTYPE(x) ((x) << 7)
+
+ /* valid for both DEFAULT_MTYPE and APE1_MTYPE */
+-#define MTYPE_CACHED_NV 0
+-#define MTYPE_CACHED 1
++#define MTYPE_CACHED 0
+ #define MTYPE_NONCACHED 3
+
+ #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8)
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+deleted file mode 100644
+index 751cc2e..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
++++ /dev/null
+@@ -1,1384 +0,0 @@
+-/*
+- * Copyright 2015-2017 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#if 0
+-HW (VI) source code for CWSR trap handler
+-#Version 18 + multiple trap handler
+-
+-// this performance-optimal version was originally from Seven Xu at SRDC
+-
+-// Revison #18 --...
+-/* Rev History
+-** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
+-** #4. SR Memory Layout:
+-** 1. VGPR-SGPR-HWREG-{LDS}
+-** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
+-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+-** #7. Update: 1. don't barrier if noLDS
+-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+-** 2. Fix SQ issue by s_sleep 2
+-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
+-** 2. optimize s_buffer save by burst 16sgprs...
+-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
+-** #11. Update 1. Add 2 more timestamp for debug version
+-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
+-** #13. Integ 1. Always use MUBUF for PV trap shader...
+-** #14. Update 1. s_buffer_store soft clause...
+-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
+-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
+-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
+-** 2. PERF - Save LDS before save VGPR to cover LDS save long latency...
+-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
+-** 2. FUNC - Handle non-CWSR traps
+-*/
+-
+-var G8SR_WDMEM_HWREG_OFFSET = 0
+-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
+-
+-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
+-
+-var G8SR_DEBUG_TIMESTAMP = 0
+-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+-var s_g8sr_ts_save_s = s[34:35] // save start
+-var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
+-var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
+-var s_g8sr_ts_save_d = s[40:41] // save end
+-var s_g8sr_ts_restore_s = s[42:43] // restore start
+-var s_g8sr_ts_restore_d = s[44:45] // restore end
+-
+-var G8SR_VGPR_SR_IN_DWX4 = 0
+-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
+-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+-
+-
+-/*************************************************************************/
+-/* control on how to run the shader */
+-/*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run)
+-var EMU_RUN_HACK = 0
+-var EMU_RUN_HACK_RESTORE_NORMAL = 0
+-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var SAVE_LDS = 1
+-var WG_BASE_ADDR_LO = 0x9000a000
+-var WG_BASE_ADDR_HI = 0x0
+-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+-var CTX_SAVE_CONTROL = 0x0
+-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run)
+-var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+-
+-/**************************************************************************/
+-/* variables */
+-/**************************************************************************/
+-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+-
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+-
+-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+-
+-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+-
+-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+-
+-
+-/* Save */
+-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+-
+-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+-
+-var s_save_spi_init_lo = exec_lo
+-var s_save_spi_init_hi = exec_hi
+-
+- //tba_lo and tba_hi need to be saved/restored
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+-var s_save_pc_hi = ttmp1
+-var s_save_exec_lo = ttmp2
+-var s_save_exec_hi = ttmp3
+-var s_save_status = ttmp4
+-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+-var s_save_xnack_mask_lo = ttmp6
+-var s_save_xnack_mask_hi = ttmp7
+-var s_save_buf_rsrc0 = ttmp8
+-var s_save_buf_rsrc1 = ttmp9
+-var s_save_buf_rsrc2 = ttmp10
+-var s_save_buf_rsrc3 = ttmp11
+-
+-var s_save_mem_offset = tma_lo
+-var s_save_alloc_size = s_save_trapsts //conflict
+-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+-var s_save_m0 = tma_hi
+-
+-/* Restore */
+-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+-
+-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+-
+-var s_restore_spi_init_lo = exec_lo
+-var s_restore_spi_init_hi = exec_hi
+-
+-var s_restore_mem_offset = ttmp2
+-var s_restore_alloc_size = ttmp3
+-var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored
+-var s_restore_mem_offset_save = s_restore_tmp //no conflict
+-
+-var s_restore_m0 = s_restore_alloc_size //no conflict
+-
+-var s_restore_mode = ttmp7
+-
+-var s_restore_pc_lo = ttmp0
+-var s_restore_pc_hi = ttmp1
+-var s_restore_exec_lo = tma_lo //no conflict
+-var s_restore_exec_hi = tma_hi //no conflict
+-var s_restore_status = ttmp4
+-var s_restore_trapsts = ttmp5
+-var s_restore_xnack_mask_lo = xnack_mask_lo
+-var s_restore_xnack_mask_hi = xnack_mask_hi
+-var s_restore_buf_rsrc0 = ttmp8
+-var s_restore_buf_rsrc1 = ttmp9
+-var s_restore_buf_rsrc2 = ttmp10
+-var s_restore_buf_rsrc3 = ttmp11
+-
+-/**************************************************************************/
+-/* trap handler entry points */
+-/**************************************************************************/
+-/* Shader Main*/
+-
+-shader main
+- asic(VI)
+- type(CS)
+-
+-
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+- else
+- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+- end
+-
+-L_JUMP_TO_RESTORE:
+- s_branch L_RESTORE //restore
+-
+-L_SKIP_RESTORE:
+-
+- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+- s_cbranch_scc1 L_SAVE //this is the operation for save
+-
+- // ********* Handle non-CWSR traps *******************
+-if (!EMU_RUN_HACK)
+- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
+- s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
+- s_waitcnt lgkmcnt(0)
+- s_or_b32 ttmp7, ttmp8, ttmp9
+- s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+-
+-L_NO_NEXT_TRAP:
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+- s_addc_u32 ttmp1, ttmp1, 0
+-L_EXCP_CASE:
+- s_and_b32 ttmp1, ttmp1, 0xFFFF
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_rfe_b64 [ttmp0, ttmp1]
+-end
+- // ********* End handling of non-CWSR traps *******************
+-
+-/**************************************************************************/
+-/* save routine */
+-/**************************************************************************/
+-
+-L_SAVE:
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_save_s
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+-end
+-
+- //check whether there is mem_viol
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_NO_PC_REWIND
+-
+- //if so, need rewind PC assuming GDS operation gets NACKed
+- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
+-
+-L_NO_PC_REWIND:
+- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+-
+- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+- s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+-
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
+-
+- /* inform SPI the readiness and wait for SPI's go signal */
+- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+- s_mov_b32 s_save_exec_hi, exec_hi
+- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_sq_save_msg
+- s_waitcnt lgkmcnt(0)
+-end
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+- end
+-
+- L_SLEEP:
+- s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_cbranch_execz L_SLEEP
+- end
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_spi_wrexec
+- s_waitcnt lgkmcnt(0)
+-end
+-
+- /* setup Resource Contants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_save_tmp, v9, 0
+- s_lshr_b32 s_save_tmp, s_save_tmp, 6
+- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+-
+-
+- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+-
+- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
+- s_mov_b32 s_save_m0, m0 //save M0
+-
+- /* global mem offset */
+- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+-
+-
+-
+-
+- /* save HW registers */
+- //////////////////////////////
+-
+- L_SAVE_HWREG:
+- // HWREG SR memory offset : size(VGPR)+size(SGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+-
+-
+- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
+-
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
+- s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
+- end
+-
+- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
+- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
+- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
+-
+- //s_save_trapsts conflicts with s_save_alloc_size
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+-
+- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+-
+- //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO
+- write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI
+-
+-
+-
+- /* the first wave in the threadgroup */
+- // save fist_wave bits in tba_hi unused bit.26
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit
+- //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26]
+- s_mov_b32 s_save_exec_hi, 0x0
+- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
+-
+-
+- /* save SGPRs */
+- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+- //////////////////////////////
+-
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- // TODO, change RSRC word to rearrange memory layout for SGPRS
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+-
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+- end
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+-
+- s_mov_b32 m0, 0x0 //SGPR initial index value =0
+- L_SAVE_SGPR_LOOP:
+- // SGPR is allocated in 16 SGPR granularity
+- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+-
+- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+- s_add_u32 m0, m0, 16 //next sgpr index
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
+- // restore s_save_buf_rsrc0,1
+- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+-
+-
+-
+-
+- /* save first 4 VGPR, then LDS save could use */
+- // each wave will alloc 4 vgprs at least...
+- /////////////////////////////////////////////////////////////////////////////////////
+-
+- s_mov_b32 s_save_mem_offset, 0
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+-
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+-else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+-end
+-
+-
+-
+- /* save LDS */
+- //////////////////////////////
+-
+- L_SAVE_LDS:
+-
+- // Change EXEC to all threads...
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE
+-
+- s_barrier //LDS is used? wait for other waves in the same TG
+- //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_cbranch_scc0 L_SAVE_LDS_DONE
+-
+- // first wave do LDS save;
+-
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+-
+- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+- //
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+-
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+-
+-
+-var LDS_DMA_ENABLE = 0
+-var UNROLL = 0
+-if UNROLL==0 && LDS_DMA_ENABLE==1
+- s_mov_b32 s3, 256*2
+- s_nop 0
+- s_nop 0
+- s_nop 0
+- L_SAVE_LDS_LOOP:
+- //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
+- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+- end
+-
+- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
+-
+-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss
+- // store from higest LDS address to lowest
+- s_mov_b32 s3, 256*2
+- s_sub_u32 m0, s_save_alloc_size, s3
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+- s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks...
+- s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest
+- s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc
+- s_nop 0
+- s_nop 0
+- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
+- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
+- s_add_u32 s0, s0,s_save_alloc_size
+- s_addc_u32 s1, s1, 0
+- s_setpc_b64 s[0:1]
+-
+-
+- for var i =0; i< 128; i++
+- // be careful to make here a 64Byte aligned address, which could improve performance...
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+-
+- if i!=127
+- s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline
+- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
+- end
+- end
+-
+-else // BUFFER_STORE
+- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+- v_mul_i32_i24 v2, v3, 8 // tid*8
+- v_mov_b32 v3, 256*2
+- s_mov_b32 m0, 0x10000
+- s_mov_b32 s0, s_save_buf_rsrc3
+- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
+-
+-L_SAVE_LDS_LOOP_VECTOR:
+- ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
+- s_waitcnt lgkmcnt(0)
+- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+-// s_waitcnt vmcnt(0)
+- v_add_u32 v2, vcc[0:1], v2, v3
+- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+-
+- // restore rsrc3
+- s_mov_b32 s_save_buf_rsrc3, s0
+-
+-end
+-
+-L_SAVE_LDS_DONE:
+-
+-
+- /* save VGPRs - set the Rest VGPRs */
+- //////////////////////////////////////////////////////////////////////////////////////
+- L_SAVE_VGPR:
+- // VGPR SR memory offset: 0
+- // TODO rearrange the RSRC words to use swizzle for VGPR save...
+-
+- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- s_mov_b32 m0, 4 // skip first 4 VGPRs
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
+-
+- s_set_gpr_idx_on m0, 0x1 // This will change M0
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
+-L_SAVE_VGPR_LOOP:
+- v_mov_b32 v0, v0 // v0 = v[0+m0]
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+-
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- s_add_u32 m0, m0, 4
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+- s_set_gpr_idx_off
+-L_SAVE_VGPR_LOOP_END:
+-
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+-else
+- // VGPR store using dw burst
+- s_mov_b32 m0, 0x4 //VGPR initial index value =0
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc0 L_SAVE_VGPR_END
+-
+-
+- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
+-
+- L_SAVE_VGPR_LOOP:
+- v_mov_b32 v0, v0 //v0 = v[0+m0]
+- v_mov_b32 v1, v1 //v0 = v[0+m0]
+- v_mov_b32 v2, v2 //v0 = v[0+m0]
+- v_mov_b32 v3, v3 //v0 = v[0+m0]
+-
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+- end
+-
+- s_add_u32 m0, m0, 4 //next vgpr index
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+- s_set_gpr_idx_off
+-end
+-
+-L_SAVE_VGPR_END:
+-
+-
+-
+-
+-
+-
+- /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- s_rfe_b64 s_save_pc_lo //Return to the main shader program
+- else
+- end
+-
+-// Save Done timestamp
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_save_d
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+- // Need reset rsrc2??
+- s_mov_b32 m0, s_save_mem_offset
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+-end
+-
+-
+- s_branch L_END_PGM
+-
+-
+-
+-/**************************************************************************/
+-/* restore routine */
+-/**************************************************************************/
+-
+-L_RESTORE:
+- /* Setup Resource Contants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_restore_tmp, v9, 0
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+- else
+- end
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_restore_s
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+- // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
+- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+- s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored..
+-end
+-
+-
+-
+- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+-
+- /* global mem offset */
+-// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+-
+- /* the first wave in the threadgroup */
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+- s_cbranch_scc0 L_RESTORE_VGPR
+-
+- /* restore LDS */
+- //////////////////////////////
+- L_RESTORE_LDS:
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+-
+- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+- //
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
+-
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+-
+- L_RESTORE_LDS_LOOP:
+- if (SAVE_LDS)
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
+- end
+- s_add_u32 m0, m0, 256*2 // 128 DW
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
+-
+-
+- /* restore VGPRs */
+- //////////////////////////////
+- L_RESTORE_VGPR:
+- // VGPR SR memory offset : 0
+- s_mov_b32 s_restore_mem_offset, 0x0
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+-
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- s_mov_b32 m0, s_restore_alloc_size
+- s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0
+-
+-L_RESTORE_VGPR_LOOP:
+- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- s_waitcnt vmcnt(0)
+- s_sub_u32 m0, m0, 4
+- v_mov_b32 v0, v0 // v[0+m0] = v0
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+- s_cmp_eq_u32 m0, 0x8000
+- s_cbranch_scc0 L_RESTORE_VGPR_LOOP
+- s_set_gpr_idx_off
+-
+- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes
+-
+-else
+- // VGPR load using dw burst
+- s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+- s_mov_b32 m0, 4 //VGPR initial index value = 1
+- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
+-
+- L_RESTORE_VGPR_LOOP:
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+- end
+- s_waitcnt vmcnt(0) //ensure data ready
+- v_mov_b32 v0, v0 //v[0+m0] = v0
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+- s_add_u32 m0, m0, 4 //next vgpr index
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
+- s_set_gpr_idx_off
+- /* VGPR restore on v0 */
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+- end
+-
+-end
+-
+- /* restore SGPRs */
+- //////////////////////////////
+-
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group
+- // TODO, change RSRC word to rearrange memory layout for SGPRS
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+-
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+- end
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111),
+- However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
+- */
+- s_mov_b32 m0, s_restore_alloc_size
+-
+- L_RESTORE_SGPR_LOOP:
+- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
+- s_waitcnt lgkmcnt(0) //ensure data ready
+-
+- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+-
+- s_movreld_b64 s0, s0 //s[0+m0] = s0
+- s_movreld_b64 s2, s2
+- s_movreld_b64 s4, s4
+- s_movreld_b64 s6, s6
+- s_movreld_b64 s8, s8
+- s_movreld_b64 s10, s10
+- s_movreld_b64 s12, s12
+- s_movreld_b64 s14, s14
+-
+- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
+-
+- /* restore HW registers */
+- //////////////////////////////
+- L_RESTORE_HWREG:
+-
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+-end
+-
+- // HWREG SR memory offset : size(VGPR)+size(SGPR)
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+-
+-
+- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
+- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
+- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
+- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
+- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
+- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
+- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
+- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
+- read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO
+- read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI
+-
+- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+-
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+-
+- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+-    s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hacked through s_trap but restore is normal
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+-
+- s_mov_b32 m0, s_restore_m0
+- s_mov_b32 exec_lo, s_restore_exec_lo
+- s_mov_b32 exec_hi, s_restore_exec_hi
+-
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+- //reuse s_restore_m0 as a temp register
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+-
+- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+- s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+-
+-    s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_restore_d
+- s_waitcnt lgkmcnt(0)
+-end
+-
+-// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
+-
+-
+-/**************************************************************************/
+-/* the END */
+-/**************************************************************************/
+-L_END_PGM:
+- s_endpgm
+-
+-end
+-
+-
+-/**************************************************************************/
+-/* the helper functions */
+-/**************************************************************************/
+-
+-//Only for save hwreg to mem
+-function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+- s_mov_b32 m0, s_mem_offset
+- s_buffer_store_dword s, s_rsrc, m0 glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- s_mov_b32 m0, exec_lo
+-end
+-
+-
+-// HWREGs are saved before SGPRs, so all HWREGs can be used.
+-function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+-
+- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
+- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
+- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
+- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
+- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
+-end
+-
+-
+-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+-end
+-
+-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
+- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+-end
+-
+-
+-
+-function get_lds_size_bytes(s_lds_size_byte)
+- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
+-    s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in bytes = lds_size * 64 DW * 4 bytes // granularity 64DW
+-end
+-
+-function get_vgpr_size_bytes(s_vgpr_size_byte)
+- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
+-    s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //VGPR size in bytes = (vgpr_size + 1) * 4 VGPRs * 64 threads * 4 bytes (non-zero value) //FIXME for GFX, zero is possible
+-end
+-
+-function get_sgpr_size_bytes(s_sgpr_size_byte)
+- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
+-    s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //SGPR size in bytes = (sgpr_size + 1) * 16 SGPRs * 4 bytes (non-zero value)
+-end
+-
+-function get_hwreg_size_bytes
+- return 128 //HWREG size 128 bytes
+-end
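+-
+-//(editor-added worked example; the vgpr_size/sgpr_size values are assumed, not from the original)
+-//for vgpr_size = 3 and sgpr_size = 6 the helpers above give:
+-//  get_vgpr_size_bytes  = (3+1) * 4 VGPRs * 64 threads * 4 bytes = 4096 bytes
+-//  get_sgpr_size_bytes  = (6+1) * 16 SGPRs * 4 bytes             = 448 bytes
+-//  get_hwreg_size_bytes = 128 bytes
+-//matching the layout used above: SGPRs start at size(VGPR), HWREGs at size(VGPR)+size(SGPR).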
+-
+-
+-#endif
+-
+-static const uint32_t cwsr_trap_gfx8_hex[] = {
+- 0xbf820001, 0xbf820123,
+- 0xb8f4f802, 0x89748674,
+- 0xb8f5f803, 0x8675ff75,
+- 0x00000400, 0xbf850011,
+- 0xc00a1e37, 0x00000000,
+- 0xbf8c007f, 0x87777978,
+- 0xbf840002, 0xb974f802,
+- 0xbe801d78, 0xb8f5f803,
+- 0x8675ff75, 0x000001ff,
+- 0xbf850002, 0x80708470,
+- 0x82718071, 0x8671ff71,
+- 0x0000ffff, 0xb974f802,
+- 0xbe801f70, 0xb8f5f803,
+- 0x8675ff75, 0x00000100,
+- 0xbf840006, 0xbefa0080,
+- 0xb97a0203, 0x8671ff71,
+- 0x0000ffff, 0x80f08870,
+- 0x82f18071, 0xbefa0080,
+- 0xb97a0283, 0xbef60068,
+- 0xbef70069, 0xb8fa1c07,
+- 0x8e7a9c7a, 0x87717a71,
+- 0xb8fa03c7, 0x8e7a9b7a,
+- 0x87717a71, 0xb8faf807,
+- 0x867aff7a, 0x00007fff,
+- 0xb97af807, 0xbef2007e,
+- 0xbef3007f, 0xbefe0180,
+- 0xbf900004, 0xbf8e0002,
+- 0xbf88fffe, 0xbef8007e,
+- 0x8679ff7f, 0x0000ffff,
+- 0x8779ff79, 0x00040000,
+- 0xbefa0080, 0xbefb00ff,
+- 0x00807fac, 0x867aff7f,
+- 0x08000000, 0x8f7a837a,
+- 0x877b7a7b, 0x867aff7f,
+- 0x70000000, 0x8f7a817a,
+- 0x877b7a7b, 0xbeef007c,
+- 0xbeee0080, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8fa1605, 0x807a817a,
+- 0x8e7a867a, 0x806e7a6e,
+- 0xbefa0084, 0xbefa00ff,
+- 0x01000000, 0xbefe007c,
+- 0xbefc006e, 0xc0611bfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611c3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611c7c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611cbc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611cfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611d3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xb8f5f803,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611d7c, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611dbc, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xbefe007c, 0xbefc006e,
+- 0xc0611dfc, 0x0000007c,
+- 0x806e846e, 0xbefc007e,
+- 0xb8eff801, 0xbefe007c,
+- 0xbefc006e, 0xc0611bfc,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611b3c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc006e, 0xc0611b7c,
+- 0x0000007c, 0x806e846e,
+- 0xbefc007e, 0x867aff7f,
+- 0x04000000, 0xbef30080,
+- 0x8773737a, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8f51605, 0x80758175,
+- 0x8e758475, 0x8e7a8275,
+- 0xbefa00ff, 0x01000000,
+- 0xbef60178, 0x80786e78,
+- 0x82798079, 0xbefc0080,
+- 0xbe802b00, 0xbe822b02,
+- 0xbe842b04, 0xbe862b06,
+- 0xbe882b08, 0xbe8a2b0a,
+- 0xbe8c2b0c, 0xbe8e2b0e,
+- 0xc06b003c, 0x00000000,
+- 0xc06b013c, 0x00000010,
+- 0xc06b023c, 0x00000020,
+- 0xc06b033c, 0x00000030,
+- 0x8078c078, 0x82798079,
+- 0x807c907c, 0xbf0a757c,
+- 0xbf85ffeb, 0xbef80176,
+- 0xbeee0080, 0xbefe00c1,
+- 0xbeff00c1, 0xbefa00ff,
+- 0x01000000, 0xe0724000,
+- 0x6e1e0000, 0xe0724100,
+- 0x6e1e0100, 0xe0724200,
+- 0x6e1e0200, 0xe0724300,
+- 0x6e1e0300, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f54306,
+- 0x8675c175, 0xbf84002c,
+- 0xbf8a0000, 0x867aff73,
+- 0x04000000, 0xbf840028,
+- 0x8e758675, 0x8e758275,
+- 0xbefa0075, 0xb8ee2a05,
+- 0x806e816e, 0x8e6e8a6e,
+- 0xb8fa1605, 0x807a817a,
+- 0x8e7a867a, 0x806e7a6e,
+- 0x806eff6e, 0x00000080,
+- 0xbefa00ff, 0x01000000,
+- 0xbefc0080, 0xd28c0002,
+- 0x000100c1, 0xd28d0003,
+- 0x000204c1, 0xd1060002,
+- 0x00011103, 0x7e0602ff,
+- 0x00000200, 0xbefc00ff,
+- 0x00010000, 0xbe80007b,
+- 0x867bff7b, 0xff7fffff,
+- 0x877bff7b, 0x00058000,
+- 0xd8ec0000, 0x00000002,
+- 0xbf8c007f, 0xe0765000,
+- 0x6e1e0002, 0x32040702,
+- 0xd0c9006a, 0x0000eb02,
+- 0xbf87fff7, 0xbefb0000,
+- 0xbeee00ff, 0x00000400,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f52a05, 0x80758175,
+- 0x8e758275, 0x8e7a8875,
+- 0xbefa00ff, 0x01000000,
+- 0xbefc0084, 0xbf0a757c,
+- 0xbf840015, 0xbf11017c,
+- 0x8075ff75, 0x00001000,
+- 0x7e000300, 0x7e020301,
+- 0x7e040302, 0x7e060303,
+- 0xe0724000, 0x6e1e0000,
+- 0xe0724100, 0x6e1e0100,
+- 0xe0724200, 0x6e1e0200,
+- 0xe0724300, 0x6e1e0300,
+- 0x807c847c, 0x806eff6e,
+- 0x00000400, 0xbf0a757c,
+- 0xbf85ffef, 0xbf9c0000,
+- 0xbf8200ca, 0xbef8007e,
+- 0x8679ff7f, 0x0000ffff,
+- 0x8779ff79, 0x00040000,
+- 0xbefa0080, 0xbefb00ff,
+- 0x00807fac, 0x8676ff7f,
+- 0x08000000, 0x8f768376,
+- 0x877b767b, 0x8676ff7f,
+- 0x70000000, 0x8f768176,
+- 0x877b767b, 0x8676ff7f,
+- 0x04000000, 0xbf84001e,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f34306, 0x8673c173,
+- 0xbf840019, 0x8e738673,
+- 0x8e738273, 0xbefa0073,
+- 0xb8f22a05, 0x80728172,
+- 0x8e728a72, 0xb8f61605,
+- 0x80768176, 0x8e768676,
+- 0x80727672, 0x8072ff72,
+- 0x00000080, 0xbefa00ff,
+- 0x01000000, 0xbefc0080,
+- 0xe0510000, 0x721e0000,
+- 0xe0510100, 0x721e0000,
+- 0x807cff7c, 0x00000200,
+- 0x8072ff72, 0x00000200,
+- 0xbf0a737c, 0xbf85fff6,
+- 0xbef20080, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f32a05,
+- 0x80738173, 0x8e738273,
+- 0x8e7a8873, 0xbefa00ff,
+- 0x01000000, 0xbef60072,
+- 0x8072ff72, 0x00000400,
+- 0xbefc0084, 0xbf11087c,
+- 0x8073ff73, 0x00008000,
+- 0xe0524000, 0x721e0000,
+- 0xe0524100, 0x721e0100,
+- 0xe0524200, 0x721e0200,
+- 0xe0524300, 0x721e0300,
+- 0xbf8c0f70, 0x7e000300,
+- 0x7e020301, 0x7e040302,
+- 0x7e060303, 0x807c847c,
+- 0x8072ff72, 0x00000400,
+- 0xbf0a737c, 0xbf85ffee,
+- 0xbf9c0000, 0xe0524000,
+- 0x761e0000, 0xe0524100,
+- 0x761e0100, 0xe0524200,
+- 0x761e0200, 0xe0524300,
+- 0x761e0300, 0xb8f22a05,
+- 0x80728172, 0x8e728a72,
+- 0xb8f61605, 0x80768176,
+- 0x8e768676, 0x80727672,
+- 0x80f2c072, 0xb8f31605,
+- 0x80738173, 0x8e738473,
+- 0x8e7a8273, 0xbefa00ff,
+- 0x01000000, 0xbefc0073,
+- 0xc031003c, 0x00000072,
+- 0x80f2c072, 0xbf8c007f,
+- 0x80fc907c, 0xbe802d00,
+- 0xbe822d02, 0xbe842d04,
+- 0xbe862d06, 0xbe882d08,
+- 0xbe8a2d0a, 0xbe8c2d0c,
+- 0xbe8e2d0e, 0xbf06807c,
+- 0xbf84fff1, 0xb8f22a05,
+- 0x80728172, 0x8e728a72,
+- 0xb8f61605, 0x80768176,
+- 0x8e768676, 0x80727672,
+- 0xbefa0084, 0xbefa00ff,
+- 0x01000000, 0xc0211cfc,
+- 0x00000072, 0x80728472,
+- 0xc0211c3c, 0x00000072,
+- 0x80728472, 0xc0211c7c,
+- 0x00000072, 0x80728472,
+- 0xc0211bbc, 0x00000072,
+- 0x80728472, 0xc0211bfc,
+- 0x00000072, 0x80728472,
+- 0xc0211d3c, 0x00000072,
+- 0x80728472, 0xc0211d7c,
+- 0x00000072, 0x80728472,
+- 0xc0211a3c, 0x00000072,
+- 0x80728472, 0xc0211a7c,
+- 0x00000072, 0x80728472,
+- 0xc0211dfc, 0x00000072,
+- 0x80728472, 0xc0211b3c,
+- 0x00000072, 0x80728472,
+- 0xc0211b7c, 0x00000072,
+- 0x80728472, 0xbf8c007f,
+- 0x8671ff71, 0x0000ffff,
+- 0xbefc0073, 0xbefe006e,
+- 0xbeff006f, 0x867375ff,
+- 0x000003ff, 0xb9734803,
+- 0x867375ff, 0xfffff800,
+- 0x8f738b73, 0xb973a2c3,
+- 0xb977f801, 0x8673ff71,
+- 0xf0000000, 0x8f739c73,
+- 0x8e739073, 0xbef60080,
+- 0x87767376, 0x8673ff71,
+- 0x08000000, 0x8f739b73,
+- 0x8e738f73, 0x87767376,
+- 0x8673ff74, 0x00800000,
+- 0x8f739773, 0xb976f807,
+- 0x86fe7e7e, 0x86ea6a6a,
+- 0xb974f802, 0xbf8a0000,
+- 0x95807370, 0xbf810000,
+-};
+-
+diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+deleted file mode 100644
+index f9e819b..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
++++ /dev/null
+@@ -1,1419 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#if 0
+-HW (GFX9) source code for CWSR trap handler
+-#Version 18 + multiple trap handler
+-
+-// this performance-optimal version was originally from Seven Xu at SRDC
+-
+-// Revision #18 --...
+-/* Rev History
+-** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(merged, skipped - already fixed by PV)
+-** #4. SR Memory Layout:
+-** 1. VGPR-SGPR-HWREG-{LDS}
+-**       2. tba_hi.bits.26 - reconfigured as the first-wave-in-TG bit, used to defer the LDS save for a threadgroup.. performance concern..
+-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
+-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
+-** #7. Update: 1. don't barrier if noLDS
+-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
+-** 2. Fix SQ issue by s_sleep 2
+-** #9. Update: 1. Fix SCC-restore-failed issue, restore wave_status last
+-**	       2. optimize s_buffer save by bursting 16 sgprs...
+-** #10. Update 1. Optimize restore sgpr by bursting 16 sgprs.
+-** #11. Update 1. Add 2 more timestamps for the debug version
+-** #12. Update 1. Add VGPR SR using DWx4; some cases improve and some cases drop performance
+-** #13. Integ  1. Always use MUBUF for PV trap shader...
+-** #14. Update 1. s_buffer_store soft clause...
+-** #15. Update 1. PERF - scalar write with glc:0/mtype0 to allow L2 combine. Large perf improvement.
+-** #16. Update 1. PERF - UNROLL LDS_DMA got a 2500-cycle saving in the IP tree
+-** #17. Update 1. FUNC - LDS_DMA has issues with ATC; replace with ds_read/buffer_store for the save part [TODO restore part]
+-**	       2. PERF - Save LDS before saving VGPRs to cover the long LDS-save latency...
+-** #18. Update 1. FUNC - Implicitly restore STATUS.VCCZ, which is not writable by s_setreg_b32
+-** 2. FUNC - Handle non-CWSR traps
+-*/
+-
+-var G8SR_WDMEM_HWREG_OFFSET = 0
+-var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes
+-
+-// Keep the definition the same as the app shader. These 2 timestamps are part of the app shader... They should come before any save and after any restore.
+-
+-var G8SR_DEBUG_TIMESTAMP = 0
+-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
+-var s_g8sr_ts_save_s = s[34:35] // save start
+-var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi
+-var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ
+-var s_g8sr_ts_save_d = s[40:41] // save end
+-var s_g8sr_ts_restore_s = s[42:43] // restore start
+-var s_g8sr_ts_restore_d = s[44:45] // restore end
+-
+-var G8SR_VGPR_SR_IN_DWX4 = 0
+-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes
+-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
+-
+-
+-/*************************************************************************/
+-/* control on how to run the shader */
+-/*************************************************************************/
+-//any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in the EMU run)
+-var EMU_RUN_HACK = 0
+-var EMU_RUN_HACK_RESTORE_NORMAL = 0
+-var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0
+-var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0
+-var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK
+-var SAVE_LDS = 1
+-var WG_BASE_ADDR_LO = 0x9000a000
+-var WG_BASE_ADDR_HI = 0x0
+-var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem
+-var CTX_SAVE_CONTROL = 0x0
+-var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL
+-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in the RTL run)
+-var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write
+-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
+-var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing
+-var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency
+-
+-/**************************************************************************/
+-/* variables */
+-/**************************************************************************/
+-var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23
+-var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000
+-var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006
+-var SQ_WAVE_STATUS_HALT_MASK = 0x2000
+-
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12
+-var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8
+-var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24
+-var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
+-
+-var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400
+-var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask
+-var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100
+-var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0
+-var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11
+-var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21
+-var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800
+-
+-var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME
+-var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME
+-var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME
+-
+-var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24
+-var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27
+-
+-
+-/* Save */
+-var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes
+-var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+-
+-var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_SAVE_SPI_INIT_ATC_SHIFT = 27
+-var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used
+-var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME
+-var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME
+-
+-var s_save_spi_init_lo = exec_lo
+-var s_save_spi_init_hi = exec_hi
+-
+-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]}
+-var s_save_pc_hi = ttmp1
+-var s_save_exec_lo = ttmp2
+-var s_save_exec_hi = ttmp3
+-var s_save_status = ttmp4
+-var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine
+-var s_save_xnack_mask_lo = ttmp6
+-var s_save_xnack_mask_hi = ttmp7
+-var s_save_buf_rsrc0 = ttmp8
+-var s_save_buf_rsrc1 = ttmp9
+-var s_save_buf_rsrc2 = ttmp10
+-var s_save_buf_rsrc3 = ttmp11
+-
+-var s_save_mem_offset = ttmp14
+-var s_save_alloc_size = s_save_trapsts //conflict
+-var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time)
+-var s_save_m0 = ttmp15
+-
+-/* Restore */
+-var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE
+-var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC
+-
+-var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit
+-var S_RESTORE_SPI_INIT_ATC_SHIFT = 27
+-var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype
+-var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG
+-var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26
+-
+-var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT
+-var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK
+-var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+-var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK
+-
+-var s_restore_spi_init_lo = exec_lo
+-var s_restore_spi_init_hi = exec_hi
+-
+-var s_restore_mem_offset = ttmp12
+-var s_restore_alloc_size = ttmp3
+-var s_restore_tmp = ttmp6
+-var s_restore_mem_offset_save = s_restore_tmp //no conflict
+-
+-var s_restore_m0 = s_restore_alloc_size //no conflict
+-
+-var s_restore_mode = ttmp7
+-
+-var s_restore_pc_lo = ttmp0
+-var s_restore_pc_hi = ttmp1
+-var s_restore_exec_lo = ttmp14
+-var s_restore_exec_hi = ttmp15
+-var s_restore_status = ttmp4
+-var s_restore_trapsts = ttmp5
+-var s_restore_xnack_mask_lo = xnack_mask_lo
+-var s_restore_xnack_mask_hi = xnack_mask_hi
+-var s_restore_buf_rsrc0 = ttmp8
+-var s_restore_buf_rsrc1 = ttmp9
+-var s_restore_buf_rsrc2 = ttmp10
+-var s_restore_buf_rsrc3 = ttmp11
+-
+-/**************************************************************************/
+-/* trap handler entry points */
+-/**************************************************************************/
+-/* Shader Main*/
+-
+-shader main
+- asic(GFX9)
+- type(CS)
+-
+-
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore
+- //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC
+- s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
+- s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE
+- //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE
+- s_branch L_SKIP_RESTORE //NOT restore, SAVE actually
+- else
+- s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save
+- end
+-
+-L_JUMP_TO_RESTORE:
+- s_branch L_RESTORE //restore
+-
+-L_SKIP_RESTORE:
+-
+- s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC
+- s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save
+- s_cbranch_scc1 L_SAVE //this is the operation for save
+-
+- // ********* Handle non-CWSR traps *******************
+-if (!EMU_RUN_HACK)
+- // Illegal instruction is a non-maskable exception which blocks context save.
+- // Halt the wavefront and return from the trap.
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
+- s_cbranch_scc1 L_HALT_WAVE
+-
+- // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA.
+- // Instead, halt the wavefront and return from the trap.
+- s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_NO_MEM_VIOL
+-
+-L_HALT_WAVE:
+- // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
+- // We cannot prevent further faults so just terminate the wavefront.
+- s_and_b32 ttmp8, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+- s_cbranch_scc0 L_NOT_ALREADY_HALTED
+- s_endpgm
+-L_NOT_ALREADY_HALTED:
+- s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+- s_branch L_EXCP_CASE
+-
+-L_NO_MEM_VIOL:
+- /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
+- s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO)
+- s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI)
+- s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+- s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0
+- s_waitcnt lgkmcnt(0)
+- s_or_b32 ttmp7, ttmp8, ttmp9
+-    s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler has not been set
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler
+-
+-L_NO_NEXT_TRAP:
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception
+- s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly.
+- s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0
+- s_addc_u32 ttmp1, ttmp1, 0
+-L_EXCP_CASE:
+- s_and_b32 ttmp1, ttmp1, 0xFFFF
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC)
+- s_rfe_b64 [ttmp0, ttmp1]
+-end
+- // ********* End handling of non-CWSR traps *******************
+-
+-/**************************************************************************/
+-/* save routine */
+-/**************************************************************************/
+-
+-L_SAVE:
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_save_s
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+-end
+-
+- //check whether there is mem_viol
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+- s_cbranch_scc0 L_NO_PC_REWIND
+-
+-    //if so, we need to rewind the PC, assuming the GDS operation got NACKed
+- s_mov_b32 s_save_tmp, 0 //clear mem_viol bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8
+- s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc
+-
+-L_NO_PC_REWIND:
+- s_mov_b32 s_save_tmp, 0 //clear saveCtx bit
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit
+-
+- s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK
+-    s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi    //the XNACK save must come before any memory operation
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY
+- s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp
+- s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS
+- s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+-
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp
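+-    //(editor-added note, illustrative only) RCNT is parked in s_save_pc_hi[31:28] and
+-    //FIRST_REPLAY in bit 27; the saved PC only needs pc_hi[15:0] (pc[47:32]), so these
+-    //spare bits carry the IB_STS state until the restore path shifts them back into
+-    //HW_REG_IB_STS.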
+-
+- /* inform SPI the readiness and wait for SPI's go signal */
+- s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI
+- s_mov_b32 s_save_exec_hi, exec_hi
+- s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_sq_save_msg
+- s_waitcnt lgkmcnt(0)
+-end
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC
+- end
+-
+- L_SLEEP:
+-    s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7th/8th wave could not get arbitration to execute instructions while other waves are stuck in the sleep loop waiting for wrexec!=0
+-
+- if (EMU_RUN_HACK)
+-
+- else
+- s_cbranch_execz L_SLEEP
+- end
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_spi_wrexec
+- s_waitcnt lgkmcnt(0)
+-end
+-
+-    /* setup Resource Constants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_save_tmp, v9, 0
+- s_lshr_b32 s_save_tmp, s_save_tmp, 6
+- s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
+- s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
+- else
+- end
+-
+-
+- s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo
+- s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+-    s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not necessarily initialized
+- s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC
+- s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE
+-
+- //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?)
+- s_mov_b32 s_save_m0, m0 //save M0
+-
+- /* global mem offset */
+- s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0
+-
+-
+-
+-
+- /* save HW registers */
+- //////////////////////////////
+-
+- L_SAVE_HWREG:
+- // HWREG SR memory offset : size(VGPR)+size(SGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+-
+-
+- s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0
+-
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- end
+-
+- write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC
+- write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC
+- write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+- write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS
+-
+- //s_save_trapsts conflicts with s_save_alloc_size
+- s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+- write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS
+-
+- write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO
+- write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI
+-
+-    //using s_save_tmp would introduce a conflict here between s_save_tmp and s_save_buf_rsrc2
+- s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE
+- write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+-
+-
+-
+- /* the first wave in the threadgroup */
+-    s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract first wave bit
+- s_mov_b32 s_save_exec_hi, 0x0
+- s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26]
+-
+-
+- /* save SGPRs */
+- // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+- //////////////////////////////
+-
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- // TODO, change RSRC word to rearrange memory layout for SGPRS
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+-
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+- end
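+-    //(editor-added example; the sgpr_size value is assumed) with sgpr_size = 6, s_save_alloc_size =
+-    //(6+1)*16 = 112 SGPRs; with SGPR_SAVE_USE_SQC the NUM_RECORDS above becomes
+-    //112 * 4 = 448 bytes (one dword per SGPR), otherwise 112 * 256 bytes (64 threads * 4 bytes).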
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+- //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
+- s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0
+- s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+- s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+-
+- s_mov_b32 m0, 0x0 //SGPR initial index value =0
+- s_nop 0x0 //Manually inserted wait states
+- L_SAVE_SGPR_LOOP:
+- // SGPR is allocated in 16 SGPR granularity
+- s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0]
+- s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0]
+- s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0]
+- s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0]
+- s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0]
+- s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0]
+- s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0]
+- s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0]
+-
+- write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4
+- s_add_u32 m0, m0, 16 //next sgpr index
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete?
+- // restore s_save_buf_rsrc0,1
+- //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo
+- s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo
+-
+-
+-
+-
+- /* save first 4 VGPR, then LDS save could use */
+- // each wave will alloc 4 vgprs at least...
+- /////////////////////////////////////////////////////////////////////////////////////
+-
+- s_mov_b32 s_save_mem_offset, 0
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+-
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+-else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+-end
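+-//(editor-added note, illustrative only) with the 4-byte swizzled stride and ADD_TID_ENABLE in
+-//the buffer resource, each buffer_store_dword covers 64 lanes * 4 bytes = 256 bytes, which is
+-//why v1..v3 use offset:256, 256*2 and 256*3 in the MUBUF path above.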
+-
+-
+-
+- /* save LDS */
+- //////////////////////////////
+-
+- L_SAVE_LDS:
+-
+- // Change EXEC to all threads...
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero?
+-    s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_LDS_DONE
+-
+- s_barrier //LDS is used? wait for other waves in the same TG
+- s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here
+- s_cbranch_scc0 L_SAVE_LDS_DONE
+-
+- // first wave do LDS save;
+-    // the first wave does the LDS save;
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes
+-
+- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+- //
+- get_vgpr_size_bytes(s_save_mem_offset)
+- get_sgpr_size_bytes(s_save_tmp)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
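+-    //(editor-added worked example; the sizes are assumed) with a 4096-byte VGPR area (vgpr_size = 3)
+-    //and a 448-byte SGPR area (sgpr_size = 6), the LDS data starts at
+-    //4096 + 448 + 128 (HWREG) = 4672 bytes into the save area.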
+-
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+-
+-
+-var LDS_DMA_ENABLE = 0
+-var UNROLL = 0
+-if UNROLL==0 && LDS_DMA_ENABLE==1
+- s_mov_b32 s3, 256*2
+- s_nop 0
+- s_nop 0
+- s_nop 0
+- L_SAVE_LDS_LOOP:
+-    //TODO: it looks like the 2 buffer_store/load clauses for s/r will hurt performance???
+- if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+- end
+-
+- s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete?
+-
+-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROLL, has icache miss
+-      // store from highest LDS address to lowest
+- s_mov_b32 s3, 256*2
+- s_sub_u32 m0, s_save_alloc_size, s3
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
+-      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128-DW chunks...
+-      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from highest addr to lowest
+-      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block costs 6*4 bytes of instructions
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc
+- s_nop 0
+- s_nop 0
+- s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes
+- s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved
+- s_add_u32 s0, s0,s_save_alloc_size
+- s_addc_u32 s1, s1, 0
+- s_setpc_b64 s[0:1]
+-
+-
+- for var i =0; i< 128; i++
+-      // be careful to make this a 64-byte aligned address, which could improve performance...
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW
+- buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
+-
+- if i!=127
+-      s_sub_u32 m0, m0, s3 // use an sgpr to shrink a 2DW inst to a 1DW inst to improve performance, i.e. pack more LDS_DMA insts into one cacheline
+- s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3
+- end
+- end
+-
+-else // BUFFER_STORE
+- v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
+- v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid
+- v_mul_i32_i24 v2, v3, 8 // tid*8
+- v_mov_b32 v3, 256*2
+- s_mov_b32 m0, 0x10000
+- s_mov_b32 s0, s_save_buf_rsrc3
+- s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid
+- s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT
+-
+-L_SAVE_LDS_LOOP_VECTOR:
+- ds_read_b64 v[0:1], v2 //x =LDS[a], byte address
+- s_waitcnt lgkmcnt(0)
+- buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1
+-// s_waitcnt vmcnt(0)
+-// v_add_u32 v2, vcc[0:1], v2, v3
+- v_add_u32 v2, v2, v3
+- v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+- s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
+-
+- // restore rsrc3
+- s_mov_b32 s_save_buf_rsrc3, s0
+-
+-end
+-
+-L_SAVE_LDS_DONE:
+-
+-
+- /* save VGPRs - set the Rest VGPRs */
+- //////////////////////////////////////////////////////////////////////////////////////
+- L_SAVE_VGPR:
+- // VGPR SR memory offset: 0
+- // TODO rearrange the RSRC words to use swizzle for VGPR save...
+-
+- s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 1
+- s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible
+- s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-
+- // VGPR Allocated in 4-GPR granularity
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- s_mov_b32 m0, 4 // skip first 4 VGPRs
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs
+-
+- s_set_gpr_idx_on m0, 0x1 // This will change M0
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0
+-L_SAVE_VGPR_LOOP:
+- v_mov_b32 v0, v0 // v0 = v[0+m0]
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+-
+-
+- buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- s_add_u32 m0, m0, 4
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+- s_set_gpr_idx_off
+-L_SAVE_VGPR_LOOP_END:
+-
+- s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes
+-else
+- // VGPR store using dw burst
+-    s_mov_b32 m0, 0x4 //VGPR initial index value = 4 (v0-v3 were already saved above)
+- s_cmp_lt_u32 m0, s_save_alloc_size
+- s_cbranch_scc0 L_SAVE_VGPR_END
+-
+-
+- s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+- s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later
+-
+- L_SAVE_VGPR_LOOP:
+- v_mov_b32 v0, v0 //v0 = v[0+m0]
+-        v_mov_b32 v1, v1 //v1 = v[1+m0]
+-        v_mov_b32 v2, v2 //v2 = v[2+m0]
+-        v_mov_b32 v3, v3 //v3 = v[3+m0]
+-
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+- buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+- buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+- buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+- end
+-
+- s_add_u32 m0, m0, 4 //next vgpr index
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete?
+- s_set_gpr_idx_off
+-end
+-
+-L_SAVE_VGPR_END:
+-
+-
+-
+-
+-
+-
+- /* S_PGM_END_SAVED */ //FIXME graphics ONLY
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
+- s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32]
+- s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4
+- s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over
+- s_rfe_b64 s_save_pc_lo //Return to the main shader program
+- else
+- end
+-
+-// Save Done timestamp
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_save_d
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_save_mem_offset)
+- s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+- // Need reset rsrc2??
+- s_mov_b32 m0, s_save_mem_offset
+- s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+-end
+-
+-
+- s_branch L_END_PGM
+-
+-
+-
+-/**************************************************************************/
+-/* restore routine */
+-/**************************************************************************/
+-
+-L_RESTORE:
+-    /* Setup Resource Constants */
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- //calculate wd_addr using absolute thread id
+- v_readlane_b32 s_restore_tmp, v9, 0
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+- s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+- s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+- s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+- s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+- else
+- end
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_restore_s
+- s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+-    // tma_lo/hi are sgpr 110, 111, which will not be used for the 112-SGPR-allocated case...
+- s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+-    s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, since exec will be finally restored..
+-end
+-
+-
+-
+- s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+- s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+- s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+- s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+- s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+- s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+-
+- /* global mem offset */
+-// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+-
+- /* the first wave in the threadgroup */
+- s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+- s_cbranch_scc0 L_RESTORE_VGPR
+-
+- /* restore LDS */
+- //////////////////////////////
+- L_RESTORE_LDS:
+-
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+- s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+- s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes
+- s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes
+-
+- // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG)
+- //
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow???
+-
+-
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+- s_mov_b32 m0, 0x0 //lds_offset initial value = 0
+-
+- L_RESTORE_LDS_LOOP:
+- if (SAVE_LDS)
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW
+- end
+- s_add_u32 m0, m0, 256*2 // 128 DW
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete?
+-
+-
+- /* restore VGPRs */
+- //////////////////////////////
+- L_RESTORE_VGPR:
+- // VGPR SR memory offset : 0
+- s_mov_b32 s_restore_mem_offset, 0x0
+- s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+- s_mov_b32 exec_hi, 0xFFFFFFFF
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value)
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4)
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+-if G8SR_VGPR_SR_IN_DWX4
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+-
+- // the const stride for DWx4 is 4*4 bytes
+- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes
+-
+- s_mov_b32 m0, s_restore_alloc_size
+- s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0
+-
+-L_RESTORE_VGPR_LOOP:
+- buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- s_waitcnt vmcnt(0)
+- s_sub_u32 m0, m0, 4
+- v_mov_b32 v0, v0 // v[0+m0] = v0
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+- s_cmp_eq_u32 m0, 0x8000
+- s_cbranch_scc0 L_RESTORE_VGPR_LOOP
+- s_set_gpr_idx_off
+-
+- s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0
+- s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes
+-
+-else
+- // VGPR load using dw burst
+-    s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore starts with v1; v0 will be the last
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4
+- s_mov_b32 m0, 4 //VGPR initial index value = 1
+- s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later
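+-    //(editor-added note, illustrative only) s_set_gpr_idx_on above sets M0[15:12] = 0x8, so the
+-    //same 0x8000 bias is added to s_restore_alloc_size; the s_cmp_lt_u32 in the loop below then
+-    //effectively compares just the VGPR index kept in m0[7:0] against the VGPR count.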
+-
+- L_RESTORE_VGPR_LOOP:
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+- end
+- s_waitcnt vmcnt(0) //ensure data ready
+- v_mov_b32 v0, v0 //v[0+m0] = v0
+- v_mov_b32 v1, v1
+- v_mov_b32 v2, v2
+- v_mov_b32 v3, v3
+- s_add_u32 m0, m0, 4 //next vgpr index
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes
+- s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete?
+- s_set_gpr_idx_off
+- /* VGPR restore on v0 */
+- if(USE_MTBUF_INSTEAD_OF_MUBUF)
+- tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+- else
+- buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+- buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+- buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+- buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+- end
+-
+-end
+-
+- /* restore SGPRs */
+- //////////////////////////////
+-
+- // SGPR SR memory offset : size(VGPR)
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+- s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group
+- // TODO, change RSRC word to rearrange memory layout for SGPRS
+-
+- s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+- s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+-
+- if (SGPR_SAVE_USE_SQC)
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
+- else
+- s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+- end
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- s_mov_b32 m0, s_restore_alloc_size
+-
+- L_RESTORE_SGPR_LOOP:
+- read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
+- s_waitcnt lgkmcnt(0) //ensure data ready
+-
+- s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+- s_nop 0 // hazard SALU M0=> S_MOVREL
+-
+- s_movreld_b64 s0, s0 //s[0+m0] = s0
+- s_movreld_b64 s2, s2
+- s_movreld_b64 s4, s4
+- s_movreld_b64 s6, s6
+- s_movreld_b64 s8, s8
+- s_movreld_b64 s10, s10
+- s_movreld_b64 s12, s12
+- s_movreld_b64 s14, s14
+-
+- s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0
+- s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
+-
+- /* restore HW registers */
+- //////////////////////////////
+- L_RESTORE_HWREG:
+-
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+- s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+-end
+-
+- // HWREG SR memory offset : size(VGPR)+size(SGPR)
+- get_vgpr_size_bytes(s_restore_mem_offset)
+- get_sgpr_size_bytes(s_restore_tmp)
+- s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+-
+-
+- s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+- if (SWIZZLE_EN)
+- s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+- else
+- s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+- end
+-
+- read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0
+- read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC
+- read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC
+- read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+- read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS
+- read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS
+- read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO
+- read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI
+- read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE
+-
+- s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS
+-
+- s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS
+-
+- //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
+- if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore)
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+- if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
+- s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal
+- s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over
+- end
+-
+- s_mov_b32 m0, s_restore_m0
+- s_mov_b32 exec_lo, s_restore_exec_lo
+- s_mov_b32 exec_hi, s_restore_exec_hi
+-
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+- s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+- s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+- //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
+- s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode
+- //reuse s_restore_m0 as a temp register
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+- s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+- s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+- s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0
+- s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+- s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+- s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp
+-
+- s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32
+- s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32
+- s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu
+-
+- s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time
+-
+-if G8SR_DEBUG_TIMESTAMP
+- s_memrealtime s_g8sr_ts_restore_d
+- s_waitcnt lgkmcnt(0)
+-end
+-
+-// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution
+- s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc
+-
+-
+-/**************************************************************************/
+-/* the END */
+-/**************************************************************************/
+-L_END_PGM:
+- s_endpgm
+-
+-end
+-
+-
+-/**************************************************************************/
+-/* the helper functions */
+-/**************************************************************************/
+-
+-//Only for save hwreg to mem
+-function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+- s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on
+- s_mov_b32 m0, s_mem_offset
+- s_buffer_store_dword s, s_rsrc, m0 glc:1
+-if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+-end
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+- s_mov_b32 m0, exec_lo
+-end
+-
+-
+-// HWREG are saved before SGPRs, so all HWREG could be use.
+-function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+-
+- s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1
+-if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+-end
+- s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1
+-if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+-end
+- s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1
+-if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+-end
+- s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1
+-if ACK_SQC_STORE
+- s_waitcnt lgkmcnt(0)
+-end
+- s_add_u32 s_rsrc[0], s_rsrc[0], 4*16
+- s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc
+-end
+-
+-
+-function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1
+- s_add_u32 s_mem_offset, s_mem_offset, 4
+-end
+-
+-function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+- s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1
+- s_sub_u32 s_mem_offset, s_mem_offset, 4*16
+-end
+-
+-
+-
+-function get_lds_size_bytes(s_lds_size_byte)
+- // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
+- s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size
+- s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
+-end
+-
+-function get_vgpr_size_bytes(s_vgpr_size_byte)
+- s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size
+- s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1
+- s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible
+-end
+-
+-function get_sgpr_size_bytes(s_sgpr_size_byte)
+- s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size
+- s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1
+- s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value)
+-end
+-
+-function get_hwreg_size_bytes
+- return 128 //HWREG size 128 bytes
+-end
+-
+-
+-
+-#endif
+-
+-static const uint32_t cwsr_trap_gfx9_hex[] = {
+- 0xbf820001, 0xbf820136,
+- 0xb8f0f802, 0x89708670,
+- 0xb8f1f803, 0x8674ff71,
+- 0x00000400, 0xbf850021,
+- 0x8674ff71, 0x00000800,
+- 0xbf850003, 0x8674ff71,
+- 0x00000100, 0xbf840007,
+- 0x8674ff70, 0x00002000,
+- 0xbf840001, 0xbf810000,
+- 0x8770ff70, 0x00002000,
+- 0xbf820010, 0xb8faf812,
+- 0xb8fbf813, 0x8efa887a,
+- 0xc00a1d3d, 0x00000000,
+- 0xbf8cc07f, 0x87737574,
+- 0xbf840002, 0xb970f802,
+- 0xbe801d74, 0xb8f1f803,
+- 0x8671ff71, 0x000001ff,
+- 0xbf850002, 0x806c846c,
+- 0x826d806d, 0x866dff6d,
+- 0x0000ffff, 0xb970f802,
+- 0xbe801f6c, 0xb8f1f803,
+- 0x8671ff71, 0x00000100,
+- 0xbf840006, 0xbef60080,
+- 0xb9760203, 0x866dff6d,
+- 0x0000ffff, 0x80ec886c,
+- 0x82ed806d, 0xbef60080,
+- 0xb9760283, 0xbef20068,
+- 0xbef30069, 0xb8f62407,
+- 0x8e769c76, 0x876d766d,
+- 0xb8f603c7, 0x8e769b76,
+- 0x876d766d, 0xb8f6f807,
+- 0x8676ff76, 0x00007fff,
+- 0xb976f807, 0xbeee007e,
+- 0xbeef007f, 0xbefe0180,
+- 0xbf900004, 0xbf8e0002,
+- 0xbf88fffe, 0xbef4007e,
+- 0x8675ff7f, 0x0000ffff,
+- 0x8775ff75, 0x00040000,
+- 0xbef60080, 0xbef700ff,
+- 0x00807fac, 0x8676ff7f,
+- 0x08000000, 0x8f768376,
+- 0x87777677, 0x8676ff7f,
+- 0x70000000, 0x8f768176,
+- 0x87777677, 0xbefb007c,
+- 0xbefa0080, 0xb8fa2a05,
+- 0x807a817a, 0x8e7a8a7a,
+- 0xb8f61605, 0x80768176,
+- 0x8e768676, 0x807a767a,
+- 0xbef60084, 0xbef600ff,
+- 0x01000000, 0xbefe007c,
+- 0xbefc007a, 0xc0611efa,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611b3a, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611b7a,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611bba, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611bfa,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611c3a, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xb8f1f803,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611c7a, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xbefe007c,
+- 0xbefc007a, 0xc0611cba,
+- 0x0000007c, 0xbf8cc07f,
+- 0x807a847a, 0xbefc007e,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611cfa, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0xb8fbf801,
+- 0xbefe007c, 0xbefc007a,
+- 0xc0611efa, 0x0000007c,
+- 0xbf8cc07f, 0x807a847a,
+- 0xbefc007e, 0x8676ff7f,
+- 0x04000000, 0xbeef0080,
+- 0x876f6f76, 0xb8fa2a05,
+- 0x807a817a, 0x8e7a8a7a,
+- 0xb8f11605, 0x80718171,
+- 0x8e718471, 0x8e768271,
+- 0xbef600ff, 0x01000000,
+- 0xbef20174, 0x80747a74,
+- 0x82758075, 0xbefc0080,
+- 0xbf800000, 0xbe802b00,
+- 0xbe822b02, 0xbe842b04,
+- 0xbe862b06, 0xbe882b08,
+- 0xbe8a2b0a, 0xbe8c2b0c,
+- 0xbe8e2b0e, 0xc06b003a,
+- 0x00000000, 0xbf8cc07f,
+- 0xc06b013a, 0x00000010,
+- 0xbf8cc07f, 0xc06b023a,
+- 0x00000020, 0xbf8cc07f,
+- 0xc06b033a, 0x00000030,
+- 0xbf8cc07f, 0x8074c074,
+- 0x82758075, 0x807c907c,
+- 0xbf0a717c, 0xbf85ffe7,
+- 0xbef40172, 0xbefa0080,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xbef600ff, 0x01000000,
+- 0xe0724000, 0x7a1d0000,
+- 0xe0724100, 0x7a1d0100,
+- 0xe0724200, 0x7a1d0200,
+- 0xe0724300, 0x7a1d0300,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8f14306, 0x8671c171,
+- 0xbf84002c, 0xbf8a0000,
+- 0x8676ff6f, 0x04000000,
+- 0xbf840028, 0x8e718671,
+- 0x8e718271, 0xbef60071,
+- 0xb8fa2a05, 0x807a817a,
+- 0x8e7a8a7a, 0xb8f61605,
+- 0x80768176, 0x8e768676,
+- 0x807a767a, 0x807aff7a,
+- 0x00000080, 0xbef600ff,
+- 0x01000000, 0xbefc0080,
+- 0xd28c0002, 0x000100c1,
+- 0xd28d0003, 0x000204c1,
+- 0xd1060002, 0x00011103,
+- 0x7e0602ff, 0x00000200,
+- 0xbefc00ff, 0x00010000,
+- 0xbe800077, 0x8677ff77,
+- 0xff7fffff, 0x8777ff77,
+- 0x00058000, 0xd8ec0000,
+- 0x00000002, 0xbf8cc07f,
+- 0xe0765000, 0x7a1d0002,
+- 0x68040702, 0xd0c9006a,
+- 0x0000e302, 0xbf87fff7,
+- 0xbef70000, 0xbefa00ff,
+- 0x00000400, 0xbefe00c1,
+- 0xbeff00c1, 0xb8f12a05,
+- 0x80718171, 0x8e718271,
+- 0x8e768871, 0xbef600ff,
+- 0x01000000, 0xbefc0084,
+- 0xbf0a717c, 0xbf840015,
+- 0xbf11017c, 0x8071ff71,
+- 0x00001000, 0x7e000300,
+- 0x7e020301, 0x7e040302,
+- 0x7e060303, 0xe0724000,
+- 0x7a1d0000, 0xe0724100,
+- 0x7a1d0100, 0xe0724200,
+- 0x7a1d0200, 0xe0724300,
+- 0x7a1d0300, 0x807c847c,
+- 0x807aff7a, 0x00000400,
+- 0xbf0a717c, 0xbf85ffef,
+- 0xbf9c0000, 0xbf8200c5,
+- 0xbef4007e, 0x8675ff7f,
+- 0x0000ffff, 0x8775ff75,
+- 0x00040000, 0xbef60080,
+- 0xbef700ff, 0x00807fac,
+- 0x8672ff7f, 0x08000000,
+- 0x8f728372, 0x87777277,
+- 0x8672ff7f, 0x70000000,
+- 0x8f728172, 0x87777277,
+- 0x8672ff7f, 0x04000000,
+- 0xbf84001e, 0xbefe00c1,
+- 0xbeff00c1, 0xb8ef4306,
+- 0x866fc16f, 0xbf840019,
+- 0x8e6f866f, 0x8e6f826f,
+- 0xbef6006f, 0xb8f82a05,
+- 0x80788178, 0x8e788a78,
+- 0xb8f21605, 0x80728172,
+- 0x8e728672, 0x80787278,
+- 0x8078ff78, 0x00000080,
+- 0xbef600ff, 0x01000000,
+- 0xbefc0080, 0xe0510000,
+- 0x781d0000, 0xe0510100,
+- 0x781d0000, 0x807cff7c,
+- 0x00000200, 0x8078ff78,
+- 0x00000200, 0xbf0a6f7c,
+- 0xbf85fff6, 0xbef80080,
+- 0xbefe00c1, 0xbeff00c1,
+- 0xb8ef2a05, 0x806f816f,
+- 0x8e6f826f, 0x8e76886f,
+- 0xbef600ff, 0x01000000,
+- 0xbef20078, 0x8078ff78,
+- 0x00000400, 0xbefc0084,
+- 0xbf11087c, 0x806fff6f,
+- 0x00008000, 0xe0524000,
+- 0x781d0000, 0xe0524100,
+- 0x781d0100, 0xe0524200,
+- 0x781d0200, 0xe0524300,
+- 0x781d0300, 0xbf8c0f70,
+- 0x7e000300, 0x7e020301,
+- 0x7e040302, 0x7e060303,
+- 0x807c847c, 0x8078ff78,
+- 0x00000400, 0xbf0a6f7c,
+- 0xbf85ffee, 0xbf9c0000,
+- 0xe0524000, 0x721d0000,
+- 0xe0524100, 0x721d0100,
+- 0xe0524200, 0x721d0200,
+- 0xe0524300, 0x721d0300,
+- 0xb8f82a05, 0x80788178,
+- 0x8e788a78, 0xb8f21605,
+- 0x80728172, 0x8e728672,
+- 0x80787278, 0x80f8c078,
+- 0xb8ef1605, 0x806f816f,
+- 0x8e6f846f, 0x8e76826f,
+- 0xbef600ff, 0x01000000,
+- 0xbefc006f, 0xc031003a,
+- 0x00000078, 0x80f8c078,
+- 0xbf8cc07f, 0x80fc907c,
+- 0xbf800000, 0xbe802d00,
+- 0xbe822d02, 0xbe842d04,
+- 0xbe862d06, 0xbe882d08,
+- 0xbe8a2d0a, 0xbe8c2d0c,
+- 0xbe8e2d0e, 0xbf06807c,
+- 0xbf84fff0, 0xb8f82a05,
+- 0x80788178, 0x8e788a78,
+- 0xb8f21605, 0x80728172,
+- 0x8e728672, 0x80787278,
+- 0xbef60084, 0xbef600ff,
+- 0x01000000, 0xc0211bfa,
+- 0x00000078, 0x80788478,
+- 0xc0211b3a, 0x00000078,
+- 0x80788478, 0xc0211b7a,
+- 0x00000078, 0x80788478,
+- 0xc0211eba, 0x00000078,
+- 0x80788478, 0xc0211efa,
+- 0x00000078, 0x80788478,
+- 0xc0211c3a, 0x00000078,
+- 0x80788478, 0xc0211c7a,
+- 0x00000078, 0x80788478,
+- 0xc0211a3a, 0x00000078,
+- 0x80788478, 0xc0211a7a,
+- 0x00000078, 0x80788478,
+- 0xc0211cfa, 0x00000078,
+- 0x80788478, 0xbf8cc07f,
+- 0x866dff6d, 0x0000ffff,
+- 0xbefc006f, 0xbefe007a,
+- 0xbeff007b, 0x866f71ff,
+- 0x000003ff, 0xb96f4803,
+- 0x866f71ff, 0xfffff800,
+- 0x8f6f8b6f, 0xb96fa2c3,
+- 0xb973f801, 0x866fff6d,
+- 0xf0000000, 0x8f6f9c6f,
+- 0x8e6f906f, 0xbef20080,
+- 0x87726f72, 0x866fff6d,
+- 0x08000000, 0x8f6f9b6f,
+- 0x8e6f8f6f, 0x87726f72,
+- 0x866fff70, 0x00800000,
+- 0x8f6f976f, 0xb972f807,
+- 0x86fe7e7e, 0x86ea6a6a,
+- 0xb970f802, 0xbf8a0000,
+- 0x95806f6c, 0xbf810000,
+-};
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 933af56..660b3fb 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -25,7 +25,6 @@
+ #include <linux/err.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+-#include <linux/sched/mm.h>
+ #include <linux/slab.h>
+ #include <linux/uaccess.h>
+ #include <linux/compat.h>
+@@ -34,17 +33,13 @@
+ #include <linux/mm.h>
+ #include <linux/mman.h>
+ #include <asm/processor.h>
+-#include <linux/ptrace.h>
+-
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+ #include "kfd_dbgmgr.h"
+-#include "kfd_ipc.h"
+
+ static long kfd_ioctl(struct file *, unsigned int, unsigned long);
+ static int kfd_open(struct inode *, struct file *);
+ static int kfd_mmap(struct file *, struct vm_area_struct *);
+-static bool kfd_dev_is_large_bar(struct kfd_dev *dev);
+
+ static const char kfd_dev_name[] = "kfd";
+
+@@ -60,14 +55,6 @@ static int kfd_char_dev_major = -1;
+ static struct class *kfd_class;
+ struct device *kfd_device;
+
+-static char *kfd_devnode(struct device *dev, umode_t *mode)
+-{
+- if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0))
+- *mode = 0666;
+-
+- return NULL;
+-}
+-
+ int kfd_chardev_init(void)
+ {
+ int err = 0;
+@@ -82,8 +69,6 @@ int kfd_chardev_init(void)
+ if (IS_ERR(kfd_class))
+ goto err_class_create;
+
+- kfd_class->devnode = kfd_devnode;
+-
+ kfd_device = device_create(kfd_class, NULL,
+ MKDEV(kfd_char_dev_major, 0),
+ NULL, kfd_dev_name);
+@@ -132,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep)
+ return -EPERM;
+ }
+
+- process = kfd_create_process(filep);
++ process = kfd_create_process(current);
+ if (IS_ERR(process))
+ return PTR_ERR(process);
+
+@@ -221,7 +206,6 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
+ q_properties->ctx_save_restore_area_address =
+ args->ctx_save_restore_address;
+ q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size;
+- q_properties->ctl_stack_size = args->ctl_stack_size;
+ if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
+ args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
+ q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
+@@ -298,7 +282,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
+ p->pasid,
+ dev->id);
+
+- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id);
++ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
++ 0, q_properties.type, &queue_id);
+ if (err != 0)
+ goto err_create_queue;
+
+@@ -306,16 +291,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
+
+
+ /* Return gpu_id as doorbell offset for mmap usage */
+- args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL;
+- args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
++ args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id);
+ args->doorbell_offset <<= PAGE_SHIFT;
+- if (KFD_IS_SOC15(dev->device_info->asic_family))
+- /* On SOC15 ASICs, doorbell allocation must be
+- * per-device, and independent from the per-process
+- * queue_id. Return the doorbell offset within the
+- * doorbell aperture to user mode.
+- */
+- args->doorbell_offset |= q_properties.doorbell_off;
+
+ mutex_unlock(&p->mutex);
+
+@@ -403,58 +380,6 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
+ return retval;
+ }
+
+-static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
+- void *data)
+-{
+- int retval;
+- const int max_num_cus = 1024;
+- struct kfd_ioctl_set_cu_mask_args *args = data;
+- struct queue_properties properties;
+- uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr;
+- size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32);
+-
+- if ((args->num_cu_mask % 32) != 0) {
+- pr_debug("num_cu_mask 0x%x must be a multiple of 32",
+- args->num_cu_mask);
+- return -EINVAL;
+- }
+-
+- properties.cu_mask_count = args->num_cu_mask;
+- if (properties.cu_mask_count == 0) {
+- pr_debug("CU mask cannot be 0");
+- return -EINVAL;
+- }
+-
+- /* To prevent an unreasonably large CU mask size, set an arbitrary
+- * limit of max_num_cus bits. We can then just drop any CU mask bits
+- * past max_num_cus bits and just use the first max_num_cus bits.
+- */
+- if (properties.cu_mask_count > max_num_cus) {
+- pr_debug("CU mask cannot be greater than 1024 bits");
+- properties.cu_mask_count = max_num_cus;
+- cu_mask_size = sizeof(uint32_t) * (max_num_cus/32);
+- }
+-
+- properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL);
+- if (!properties.cu_mask)
+- return -ENOMEM;
+-
+- retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size);
+- if (retval) {
+- pr_debug("Could not copy CU mask from userspace");
+- kfree(properties.cu_mask);
+- return -EFAULT;
+- }
+-
+- mutex_lock(&p->mutex);
+-
+- retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties);
+-
+- mutex_unlock(&p->mutex);
+-
+- return retval;
+-}
+-
+ static int kfd_ioctl_set_memory_policy(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+@@ -507,38 +432,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
+ return err;
+ }
+
+-static int kfd_ioctl_set_trap_handler(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_set_trap_handler_args *args = data;
+- struct kfd_dev *dev;
+- int err = 0;
+- struct kfd_process_device *pdd;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- err = -ESRCH;
+- goto out;
+- }
+-
+- if (dev->dqm->ops.set_trap_handler(dev->dqm,
+- &pdd->qpd,
+- args->tba_addr,
+- args->tma_addr))
+- err = -EINVAL;
+-
+-out:
+- mutex_unlock(&p->mutex);
+-
+- return err;
+-}
+-
+ static int kfd_ioctl_dbg_register(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+@@ -553,8 +446,13 @@ static int kfd_ioctl_dbg_register(struct file *filep,
+ if (!dev)
+ return -EINVAL;
+
+- mutex_lock(&p->mutex);
++ if (dev->device_info->asic_family == CHIP_CARRIZO) {
++ pr_debug("kfd_ioctl_dbg_register not supported on CZ\n");
++ return -EINVAL;
++ }
++
+ mutex_lock(kfd_get_dbgmgr_mutex());
++ mutex_lock(&p->mutex);
+
+ /*
+ * make sure that we have pdd, if this the first queue created for
+@@ -582,8 +480,8 @@ static int kfd_ioctl_dbg_register(struct file *filep,
+ }
+
+ out:
+- mutex_unlock(kfd_get_dbgmgr_mutex());
+ mutex_unlock(&p->mutex);
++ mutex_unlock(kfd_get_dbgmgr_mutex());
+
+ return status;
+ }
+@@ -596,9 +494,14 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
+ long status;
+
+ dev = kfd_device_by_id(args->gpu_id);
+- if (!dev || !dev->dbgmgr)
++ if (!dev)
+ return -EINVAL;
+
++ if (dev->device_info->asic_family == CHIP_CARRIZO) {
++ pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n");
++ return -EINVAL;
++ }
++
+ mutex_lock(kfd_get_dbgmgr_mutex());
+
+ status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
+@@ -639,6 +542,11 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep,
+ if (!dev)
+ return -EINVAL;
+
++ if (dev->device_info->asic_family == CHIP_CARRIZO) {
++ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
++ return -EINVAL;
++ }
++
+ cmd_from_user = (void __user *) args->content_ptr;
+
+ /* Validate arguments */
+@@ -742,6 +650,11 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
+ if (!dev)
+ return -EINVAL;
+
++ if (dev->device_info->asic_family == CHIP_CARRIZO) {
++ pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
++ return -EINVAL;
++ }
++
+ /* input size must match the computed "compact" size */
+ if (args->buf_size_in_bytes != computed_buff_size) {
+ pr_debug("size mismatch, computed : actual %u : %u\n",
+@@ -800,37 +713,22 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
+ {
+ struct kfd_ioctl_get_clock_counters_args *args = data;
+ struct kfd_dev *dev;
+-#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \
+- || (defined OS_NAME_RHEL_7_2)
+- struct timespec time;
+-#else
+ struct timespec64 time;
+-#endif
+
+ dev = kfd_device_by_id(args->gpu_id);
+- if (dev)
+- /* Reading GPU clock counter from KGD */
+- args->gpu_clock_counter =
+- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd);
+- else
+- /* Node without GPU resource */
+- args->gpu_clock_counter = 0;
++ if (dev == NULL)
++ return -EINVAL;
++
++ /* Reading GPU clock counter from KGD */
++ args->gpu_clock_counter =
++ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd);
+
+ /* No access to rdtsc. Using raw monotonic time */
+-#if (defined OS_NAME_RHEL) && (OS_VERSION_MAJOR == 6) \
+- || (defined OS_NAME_RHEL_7_2)
+- getrawmonotonic(&time);
+- args->cpu_clock_counter = (uint64_t)timespec_to_ns(&time);
+-
+- get_monotonic_boottime(&time);
+- args->system_clock_counter = (uint64_t)timespec_to_ns(&time);
+-#else
+ getrawmonotonic64(&time);
+ args->cpu_clock_counter = (uint64_t)timespec64_to_ns(&time);
+
+ get_monotonic_boottime64(&time);
+ args->system_clock_counter = (uint64_t)timespec64_to_ns(&time);
+-#endif
+
+ /* Since the counter is in nano-seconds we use 1GHz frequency */
+ args->system_clock_freq = 1000000000;
+@@ -895,152 +793,19 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
+ return 0;
+ }
+
+-static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_get_process_apertures_new_args *args = data;
+- struct kfd_process_device_apertures *pa;
+- struct kfd_process_device *pdd;
+- uint32_t nodes = 0;
+- int ret;
+-
+- dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
+-
+- if (args->num_of_nodes == 0) {
+- /* Return number of nodes, so that user space can alloacate
+- * sufficient memory
+- */
+- mutex_lock(&p->mutex);
+-
+- if (!kfd_has_process_device_data(p))
+- goto out_upwrite;
+-
+- /* Run over all pdd of the process */
+- pdd = kfd_get_first_process_device_data(p);
+- do {
+- args->num_of_nodes++;
+- pdd = kfd_get_next_process_device_data(p, pdd);
+- } while (pdd);
+-
+- goto out_upwrite;
+- }
+-
+- /* Fill in process-aperture information for all available
+- * nodes, but not more than args->num_of_nodes as that is
+- * the amount of memory allocated by user
+- */
+- pa = kzalloc((sizeof(struct kfd_process_device_apertures) *
+- args->num_of_nodes), GFP_KERNEL);
+- if (!pa)
+- return -ENOMEM;
+-
+- mutex_lock(&p->mutex);
+-
+- if (!kfd_has_process_device_data(p)) {
+- args->num_of_nodes = 0;
+- kfree(pa);
+- goto out_upwrite;
+- }
+-
+- /* Run over all pdd of the process */
+- pdd = kfd_get_first_process_device_data(p);
+- do {
+- pa[nodes].gpu_id = pdd->dev->id;
+- pa[nodes].lds_base = pdd->lds_base;
+- pa[nodes].lds_limit = pdd->lds_limit;
+- pa[nodes].gpuvm_base = pdd->gpuvm_base;
+- pa[nodes].gpuvm_limit = pdd->gpuvm_limit;
+- pa[nodes].scratch_base = pdd->scratch_base;
+- pa[nodes].scratch_limit = pdd->scratch_limit;
+-
+- dev_dbg(kfd_device,
+- "gpu id %u\n", pdd->dev->id);
+- dev_dbg(kfd_device,
+- "lds_base %llX\n", pdd->lds_base);
+- dev_dbg(kfd_device,
+- "lds_limit %llX\n", pdd->lds_limit);
+- dev_dbg(kfd_device,
+- "gpuvm_base %llX\n", pdd->gpuvm_base);
+- dev_dbg(kfd_device,
+- "gpuvm_limit %llX\n", pdd->gpuvm_limit);
+- dev_dbg(kfd_device,
+- "scratch_base %llX\n", pdd->scratch_base);
+- dev_dbg(kfd_device,
+- "scratch_limit %llX\n", pdd->scratch_limit);
+- nodes++;
+-
+- pdd = kfd_get_next_process_device_data(p, pdd);
+- } while (pdd && (nodes < args->num_of_nodes));
+- mutex_unlock(&p->mutex);
+-
+- args->num_of_nodes = nodes;
+- ret = copy_to_user(
+- (void __user *)args->kfd_process_device_apertures_ptr,
+- pa,
+- (nodes * sizeof(struct kfd_process_device_apertures)));
+- kfree(pa);
+- return ret ? -EFAULT : 0;
+-
+-out_upwrite:
+- mutex_unlock(&p->mutex);
+- return 0;
+-}
+-
+ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
+ void *data)
+ {
+ struct kfd_ioctl_create_event_args *args = data;
+- struct kfd_dev *kfd;
+- struct kfd_process_device *pdd;
+- int err = -EINVAL;
+- void *mem, *kern_addr = NULL;
+-
+- pr_debug("Event page offset 0x%llx\n", args->event_page_offset);
+-
+- if (args->event_page_offset) {
+- kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
+- if (!kfd) {
+- pr_err("Getting device by id failed in %s\n", __func__);
+- return -EFAULT;
+- }
+- if (!kfd->device_info->is_need_iommu_device) {
+- mutex_lock(&p->mutex);
+- pdd = kfd_bind_process_to_device(kfd, p);
+- if (IS_ERR(pdd)) {
+- err = PTR_ERR(pdd);
+- goto out_upwrite;
+- }
+- mem = kfd_process_device_translate_handle(pdd,
+- GET_IDR_HANDLE(args->event_page_offset));
+- if (!mem) {
+- pr_err("Can't find BO, offset is 0x%llx\n",
+- args->event_page_offset);
+- err = -EFAULT;
+- goto out_upwrite;
+- }
+- mutex_unlock(&p->mutex);
+-
+- /* Map dGPU gtt BO to kernel */
+- kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd,
+- mem, &kern_addr);
+- }
+- }
++ int err;
+
+- err = kfd_event_create(filp, p,
+- args->event_type,
+- args->auto_reset != 0,
+- args->node_id,
+- &args->event_id,
+- &args->event_trigger_data,
+- &args->event_page_offset,
+- &args->event_slot_index,
+- kern_addr);
++ err = kfd_event_create(filp, p, args->event_type,
++ args->auto_reset != 0, args->node_id,
++ &args->event_id, &args->event_trigger_data,
++ &args->event_page_offset,
++ &args->event_slot_index);
+
+ return err;
+-
+-out_upwrite:
+- mutex_unlock(&p->mutex);
+- return err;
+ }
+
+ static int kfd_ioctl_destroy_event(struct file *filp, struct kfd_process *p,
+@@ -1071,26 +836,26 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p,
+ void *data)
+ {
+ struct kfd_ioctl_wait_events_args *args = data;
++ enum kfd_event_wait_result wait_result;
+ int err;
+
+ err = kfd_wait_on_events(p, args->num_events,
+ (void __user *)args->events_ptr,
+ (args->wait_for_all != 0),
+- args->timeout, &args->wait_result);
++ args->timeout, &wait_result);
++
++ args->wait_result = wait_result;
+
+ return err;
+ }
+-static int kfd_ioctl_alloc_scratch_memory(struct file *filep,
++static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+- struct kfd_ioctl_alloc_memory_of_scratch_args *args = data;
++ struct kfd_ioctl_set_scratch_backing_va_args *args = data;
+ struct kfd_process_device *pdd;
+ struct kfd_dev *dev;
+ long err;
+
+- if (args->size == 0)
+- return -EINVAL;
+-
+ dev = kfd_device_by_id(args->gpu_id);
+ if (!dev)
+ return -EINVAL;
+@@ -1107,521 +872,17 @@ static int kfd_ioctl_alloc_scratch_memory(struct file *filep,
+
+ mutex_unlock(&p->mutex);
+
+- if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+- pdd->qpd.vmid != 0) {
+- err = dev->kfd2kgd->alloc_memory_of_scratch(
++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0)
++ dev->kfd2kgd->set_scratch_backing_va(
+ dev->kgd, args->va_addr, pdd->qpd.vmid);
+- if (err != 0)
+- goto alloc_memory_of_scratch_failed;
+- }
+
+ return 0;
+
+ bind_process_to_device_fail:
+ mutex_unlock(&p->mutex);
+-alloc_memory_of_scratch_failed:
+- return -EFAULT;
+-}
+-
+-bool kfd_dev_is_large_bar(struct kfd_dev *dev)
+-{
+- struct kfd_local_mem_info mem_info;
+-
+- if (debug_largebar) {
+- pr_debug("Simulate large-bar allocation on non large-bar machine\n");
+- return true;
+- }
+-
+- if (dev->device_info->is_need_iommu_device)
+- return false;
+-
+- dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info);
+- if (mem_info.local_mem_size_private == 0 &&
+- mem_info.local_mem_size_public > 0)
+- return true;
+- return false;
+-}
+-
+-static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_alloc_memory_of_gpu_args *args = data;
+- struct kfd_process_device *pdd;
+- void *mem;
+- struct kfd_dev *dev;
+- int idr_handle;
+- long err;
+- uint64_t offset = args->mmap_offset;
+- uint32_t flags = args->flags;
+- struct vm_area_struct *vma;
+-
+- if (args->size == 0)
+- return -EINVAL;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
+- (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) &&
+- !kfd_dev_is_large_bar(dev)) {
+- pr_err("Alloc host visible vram on small bar is not allowed\n");
+- return -EINVAL;
+- }
+-
+- if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+- /* Check if the userptr corresponds to another (or third-party)
+- * device local memory. If so treat is as a doorbell. User
+- * space will be oblivious of this and will use this doorbell
+- * BO as a regular userptr BO
+- */
+- vma = find_vma(current->mm, args->mmap_offset);
+- if (vma && (vma->vm_flags & VM_IO)) {
+- unsigned long pfn;
+-
+- follow_pfn(vma, args->mmap_offset, &pfn);
+- flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL;
+- flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR;
+- offset = (pfn << PAGE_SHIFT);
+- }
+- } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+- if (args->size != kfd_doorbell_process_slice(dev))
+- return -EINVAL;
+- offset = kfd_get_process_doorbells(dev, p);
+- }
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- err = PTR_ERR(pdd);
+- goto err_unlock;
+- }
+-
+- err = dev->kfd2kgd->alloc_memory_of_gpu(
+- dev->kgd, args->va_addr, args->size,
+- pdd->vm, (struct kgd_mem **) &mem, &offset,
+- flags);
+-
+- if (err)
+- goto err_unlock;
+-
+- idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+- args->va_addr, args->size, NULL);
+- if (idr_handle < 0) {
+- err = -EFAULT;
+- goto err_free;
+- }
+-
+- mutex_unlock(&p->mutex);
+-
+- args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
+- args->mmap_offset = offset;
+-
+- return 0;
+-
+-err_free:
+- dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
+- (struct kgd_mem *) mem,
+- pdd->vm);
+-err_unlock:
+- mutex_unlock(&p->mutex);
+ return err;
+ }
+
+-static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_free_memory_of_gpu_args *args = data;
+- struct kfd_process_device *pdd;
+- struct kfd_bo *buf_obj;
+- struct kfd_dev *dev;
+- int ret;
+-
+- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+- if (!dev)
+- return -EINVAL;
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_get_process_device_data(dev, p);
+- if (!pdd) {
+- pr_err("Process device data doesn't exist\n");
+- ret = -EINVAL;
+- goto err_unlock;
+- }
+-
+- buf_obj = kfd_process_device_find_bo(pdd,
+- GET_IDR_HANDLE(args->handle));
+- if (!buf_obj) {
+- ret = -EINVAL;
+- goto err_unlock;
+- }
+- run_rdma_free_callback(buf_obj);
+-
+- ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem,
+- pdd->vm);
+-
+- /* If freeing the buffer failed, leave the handle in place for
+- * clean-up during process tear-down.
+- */
+- if (ret == 0)
+- kfd_process_device_remove_obj_handle(
+- pdd, GET_IDR_HANDLE(args->handle));
+-
+-err_unlock:
+- mutex_unlock(&p->mutex);
+- return ret;
+-}
+-
+-static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_map_memory_to_gpu_args *args = data;
+- struct kfd_process_device *pdd, *peer_pdd;
+- void *mem;
+- struct kfd_dev *dev, *peer;
+- long err = 0;
+- int i, num_dev = 0;
+- uint32_t *devices_arr = NULL;
+-
+- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+- if (!dev)
+- return -EINVAL;
+-
+- if (args->device_ids_array_size == 0) {
+- pr_debug("Device ID array size is 0\n");
+- return -EINVAL;
+- }
+-
+- if (args->device_ids_array_size % sizeof(uint32_t)) {
+- pr_debug("Node IDs array size %u\n",
+- args->device_ids_array_size);
+- return -EFAULT;
+- }
+-
+- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+- if (!devices_arr)
+- return -ENOMEM;
+-
+- err = copy_from_user(devices_arr,
+- (void __user *)args->device_ids_array_ptr,
+- args->device_ids_array_size);
+- if (err != 0) {
+- err = -EFAULT;
+- goto copy_from_user_failed;
+- }
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- err = PTR_ERR(pdd);
+- goto bind_process_to_device_failed;
+- }
+-
+- mem = kfd_process_device_translate_handle(pdd,
+- GET_IDR_HANDLE(args->handle));
+- if (!mem) {
+- err = -ENOMEM;
+- goto get_mem_obj_from_handle_failed;
+- }
+-
+- num_dev = args->device_ids_array_size / sizeof(uint32_t);
+- for (i = 0 ; i < num_dev; i++) {
+- peer = kfd_device_by_id(devices_arr[i]);
+- if (!peer) {
+- pr_debug("Getting device by id failed for 0x%x\n",
+- devices_arr[i]);
+- err = -EFAULT;
+- goto get_mem_obj_from_handle_failed;
+- }
+-
+- peer_pdd = kfd_bind_process_to_device(peer, p);
+- if (!peer_pdd) {
+- err = -EFAULT;
+- goto get_mem_obj_from_handle_failed;
+- }
+- err = peer->kfd2kgd->map_memory_to_gpu(
+- peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
+- if (err != 0) {
+- pr_err("Failed to map to gpu %d, num_dev=%d\n",
+- i, num_dev);
+- goto map_memory_to_gpu_failed;
+- }
+- }
+-
+- mutex_unlock(&p->mutex);
+-
+- err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true);
+- if (err) {
+- pr_debug("Sync memory failed, wait interrupted by user signal\n");
+- goto sync_memory_failed;
+- }
+-
+- /* Flush TLBs after waiting for the page table updates to complete */
+- for (i = 0; i < num_dev; i++) {
+- peer = kfd_device_by_id(devices_arr[i]);
+- if (WARN_ON_ONCE(!peer))
+- continue;
+- kfd_flush_tlb(peer, p);
+- }
+-
+- kfree(devices_arr);
+-
+- return err;
+-
+-bind_process_to_device_failed:
+-get_mem_obj_from_handle_failed:
+-map_memory_to_gpu_failed:
+- mutex_unlock(&p->mutex);
+-copy_from_user_failed:
+-sync_memory_failed:
+- kfree(devices_arr);
+-
+- return err;
+-}
+-
+-int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd)
+-{
+- int err;
+- struct kfd_dev *dev = pdd->dev;
+-
+- err = dev->kfd2kgd->unmap_memory_to_gpu(
+- dev->kgd, (struct kgd_mem *) mem, pdd->vm);
+-
+- if (err != 0)
+- return err;
+-
+- kfd_flush_tlb(dev, pdd->process);
+-
+- return 0;
+-}
+-
+-static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
+- struct kfd_process_device *pdd, *peer_pdd;
+- void *mem;
+- struct kfd_dev *dev, *peer;
+- long err = 0;
+- uint32_t *devices_arr = NULL, num_dev, i;
+-
+- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
+- if (!dev)
+- return -EINVAL;
+-
+- if (args->device_ids_array_size == 0) {
+- pr_debug("Device ID array size is 0\n");
+- return -EINVAL;
+- }
+-
+- if (args->device_ids_array_size % sizeof(uint32_t)) {
+- pr_debug("Node IDs array size %u\n",
+- args->device_ids_array_size);
+- return -EFAULT;
+- }
+-
+- devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL);
+- if (!devices_arr)
+- return -ENOMEM;
+-
+- err = copy_from_user(devices_arr,
+- (void __user *)args->device_ids_array_ptr,
+- args->device_ids_array_size);
+- if (err != 0) {
+- err = -EFAULT;
+- goto copy_from_user_failed;
+- }
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_get_process_device_data(dev, p);
+- if (!pdd) {
+- pr_debug("Process device data doesn't exist\n");
+- err = -ENODEV;
+- goto bind_process_to_device_failed;
+- }
+-
+- mem = kfd_process_device_translate_handle(pdd,
+- GET_IDR_HANDLE(args->handle));
+- if (!mem) {
+- err = -ENOMEM;
+- goto get_mem_obj_from_handle_failed;
+- }
+-
+- num_dev = args->device_ids_array_size / sizeof(uint32_t);
+- for (i = 0 ; i < num_dev; i++) {
+- peer = kfd_device_by_id(devices_arr[i]);
+- if (!peer) {
+- err = -EFAULT;
+- goto get_mem_obj_from_handle_failed;
+- }
+-
+- peer_pdd = kfd_get_process_device_data(peer, p);
+- if (!peer_pdd) {
+- err = -EFAULT;
+- goto get_mem_obj_from_handle_failed;
+- }
+- kfd_unmap_memory_from_gpu(mem, peer_pdd);
+- }
+- kfree(devices_arr);
+-
+- mutex_unlock(&p->mutex);
+-
+- return 0;
+-
+-bind_process_to_device_failed:
+-get_mem_obj_from_handle_failed:
+- mutex_unlock(&p->mutex);
+-copy_from_user_failed:
+- kfree(devices_arr);
+- return err;
+-}
+-
+-static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_set_process_dgpu_aperture_args *args = data;
+- struct kfd_dev *dev;
+- struct kfd_process_device *pdd;
+- long err;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- err = PTR_ERR(pdd);
+- goto exit;
+- }
+-
+- err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base,
+- args->dgpu_limit);
+-
+-exit:
+- mutex_unlock(&p->mutex);
+- return err;
+-}
+-
+-static int kfd_ioctl_get_dmabuf_info(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_get_dmabuf_info_args *args = data;
+- struct kfd_dev *dev = NULL;
+- struct kgd_dev *dma_buf_kgd;
+- void *metadata_buffer = NULL;
+- uint32_t flags;
+- unsigned int i;
+- int r;
+-
+- /* Find a KFD GPU device that supports the get_dmabuf_info query */
+- for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++)
+- if (dev && dev->kfd2kgd->get_dmabuf_info)
+- break;
+- if (!dev)
+- return -EINVAL;
+-
+- if (args->metadata_ptr) {
+- metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL);
+- if (!metadata_buffer)
+- return -ENOMEM;
+- }
+-
+- /* Get dmabuf info from KGD */
+- r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd,
+- &dma_buf_kgd, &args->size,
+- metadata_buffer, args->metadata_size,
+- &args->metadata_size, &flags);
+- if (r)
+- goto exit;
+-
+- /* Reverse-lookup gpu_id from kgd pointer */
+- dev = kfd_device_by_kgd(dma_buf_kgd);
+- if (!dev) {
+- r = -EINVAL;
+- goto exit;
+- }
+- args->gpu_id = dev->id;
+- args->flags = flags;
+-
+- /* Copy metadata buffer to user mode */
+- if (metadata_buffer) {
+- r = copy_to_user((void __user *)args->metadata_ptr,
+- metadata_buffer, args->metadata_size);
+- if (r != 0)
+- r = -EFAULT;
+- }
+-
+-exit:
+- kfree(metadata_buffer);
+-
+- return r;
+-}
+-
+-static int kfd_ioctl_import_dmabuf(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_import_dmabuf_args *args = data;
+- struct kfd_dev *dev;
+- int r;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd,
+- args->va_addr, &args->handle, NULL);
+- if (r)
+- pr_err("Failed to import dmabuf\n");
+-
+- return r;
+-}
+-
+-static int kfd_ioctl_ipc_export_handle(struct file *filep,
+- struct kfd_process *p,
+- void *data)
+-{
+- struct kfd_ioctl_ipc_export_handle_args *args = data;
+- struct kfd_dev *dev;
+- int r;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle);
+- if (r)
+- pr_err("Failed to export IPC handle\n");
+-
+- return r;
+-}
+-
+-static int kfd_ioctl_ipc_import_handle(struct file *filep,
+- struct kfd_process *p,
+- void *data)
+-{
+- struct kfd_ioctl_ipc_import_handle_args *args = data;
+- struct kfd_dev *dev = NULL;
+- int r;
+-
+- dev = kfd_device_by_id(args->gpu_id);
+- if (!dev)
+- return -EINVAL;
+-
+- r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle,
+- args->va_addr, &args->handle,
+- &args->mmap_offset);
+- if (r)
+- pr_err("Failed to import IPC handle\n");
+-
+- return r;
+-}
+-
+ static int kfd_ioctl_get_tile_config(struct file *filep,
+ struct kfd_process *p, void *data)
+ {
+@@ -1664,283 +925,6 @@ static int kfd_ioctl_get_tile_config(struct file *filep,
+ return 0;
+ }
+
+-#ifndef PTRACE_MODE_ATTACH_REALCREDS
+-#define PTRACE_MODE_ATTACH_REALCREDS PTRACE_MODE_ATTACH
+-#endif
+-
+-static int kfd_ioctl_cross_memory_copy(struct file *filep,
+- struct kfd_process *local_p, void *data)
+-{
+- struct kfd_ioctl_cross_memory_copy_args *args = data;
+- struct kfd_memory_range *src_array, *dst_array;
+- struct kfd_bo *src_bo, *dst_bo;
+- struct kfd_process *remote_p, *src_p, *dst_p;
+- struct task_struct *remote_task;
+- struct mm_struct *remote_mm;
+- struct pid *remote_pid;
+- struct dma_fence *fence = NULL, *lfence = NULL;
+- uint64_t dst_va_addr;
+- uint64_t copied, total_copied = 0;
+- uint64_t src_offset, dst_offset, dst_va_addr_end;
+- const char *cma_op;
+- int i, j = 0, err = 0;
+-
+- /* Check parameters */
+- if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 ||
+- args->src_mem_array_size == 0 || args->dst_mem_array_size == 0)
+- return -EINVAL;
+- args->bytes_copied = 0;
+-
+- /* Allocate space for source and destination arrays */
+- src_array = kmalloc_array((args->src_mem_array_size +
+- args->dst_mem_array_size),
+- sizeof(struct kfd_memory_range),
+- GFP_KERNEL);
+- if (!src_array)
+- return -ENOMEM;
+- dst_array = &src_array[args->src_mem_array_size];
+-
+- if (copy_from_user(src_array, (void __user *)args->src_mem_range_array,
+- args->src_mem_array_size *
+- sizeof(struct kfd_memory_range))) {
+- err = -EFAULT;
+- goto copy_from_user_fail;
+- }
+- if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array,
+- args->dst_mem_array_size *
+- sizeof(struct kfd_memory_range))) {
+- err = -EFAULT;
+- goto copy_from_user_fail;
+- }
+-
+- /* Get remote process */
+- remote_pid = find_get_pid(args->pid);
+- if (!remote_pid) {
+- pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid);
+- err = -ESRCH;
+- goto copy_from_user_fail;
+- }
+-
+- remote_task = get_pid_task(remote_pid, PIDTYPE_PID);
+- if (!remote_pid) {
+- pr_err("Cross mem copy failed. Invalid PID or task died %d\n",
+- args->pid);
+- err = -ESRCH;
+- goto get_pid_task_fail;
+- }
+-
+- /* Check access permission */
+- remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS);
+- if (!remote_mm || IS_ERR(remote_mm)) {
+- err = IS_ERR(remote_mm) ? PTR_ERR(remote_mm) : -ESRCH;
+- if (err == -EACCES) {
+- pr_err("Cross mem copy failed. Permission error\n");
+- err = -EPERM;
+- } else
+- pr_err("Cross mem copy failed. Invalid task %d\n",
+- err);
+- goto mm_access_fail;
+- }
+-
+- remote_p = kfd_get_process(remote_task);
+- if (!remote_p) {
+- pr_err("Cross mem copy failed. Invalid kfd process %d\n",
+- args->pid);
+- err = -EINVAL;
+- goto kfd_process_fail;
+- }
+-
+- if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) {
+- src_p = local_p;
+- dst_p = remote_p;
+- cma_op = "WRITE";
+- pr_debug("CMA WRITE: local -> remote\n");
+- } else {
+- src_p = remote_p;
+- dst_p = local_p;
+- cma_op = "READ";
+- pr_debug("CMA READ: remote -> local\n");
+- }
+-
+-
+- /* For each source kfd_range:
+- * - Find the BO. Each range has to be within the same BO.
+- * - Copy this range to single or multiple destination BOs.
+- * - dst_va_addr - will point to next va address into which data will
+- * be copied.
+- * - dst_bo & src_bo - the current destination and source BOs
+- * - src_offset & dst_offset - offset into the respective BOs from
+- * data will be sourced or copied
+- */
+- dst_va_addr = dst_array[0].va_addr;
+- dst_va_addr_end = dst_va_addr + dst_array[0].size - 1;
+- mutex_lock(&dst_p->mutex);
+- dst_bo = kfd_process_find_bo_from_interval(dst_p,
+- dst_va_addr,
+- dst_va_addr_end);
+- mutex_unlock(&dst_p->mutex);
+- if (!dst_bo || dst_va_addr_end > dst_bo->it.last) {
+- pr_err("CMA %s failed. Invalid dst range\n", cma_op);
+- err = -EFAULT;
+- goto kfd_process_fail;
+- }
+- dst_offset = dst_va_addr - dst_bo->it.start;
+-
+- for (i = 0; i < args->src_mem_array_size; i++) {
+- uint64_t src_va_addr_end = src_array[i].va_addr +
+- src_array[i].size - 1;
+- uint64_t src_size_to_copy = src_array[i].size;
+-
+- mutex_lock(&src_p->mutex);
+- src_bo = kfd_process_find_bo_from_interval(src_p,
+- src_array[i].va_addr,
+- src_va_addr_end);
+- mutex_unlock(&src_p->mutex);
+- if (!src_bo || src_va_addr_end > src_bo->it.last) {
+- pr_err("CMA %s failed. Invalid src range\n", cma_op);
+- err = -EFAULT;
+- break;
+- }
+-
+- src_offset = src_array[i].va_addr - src_bo->it.start;
+-
+- /* Copy src_bo to one or multiple dst_bo(s) based on size and
+- * and current copy location.
+- */
+- while (j < args->dst_mem_array_size) {
+- uint64_t copy_size;
+- int64_t space_left;
+-
+- /* Find the current copy_size. This will be smaller of
+- * the following
+- * - space left in the current dest memory range
+- * - data left to copy from source range
+- */
+- space_left = (dst_array[j].va_addr + dst_array[j].size)
+- - dst_va_addr;
+- copy_size = (src_size_to_copy < space_left) ?
+- src_size_to_copy : space_left;
+-
+- /* Check both BOs belong to same device */
+- if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+- pr_err("CMA %s fail. Not same dev\n", cma_op);
+- err = -EINVAL;
+- break;
+- }
+-
+- /* Store prev fence. Release it when a later fence is
+- * created
+- */
+- lfence = fence;
+- fence = NULL;
+-
+- err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(
+- src_bo->dev->kgd,
+- src_bo->mem, src_offset,
+- dst_bo->mem, dst_offset,
+- copy_size,
+- &fence, &copied);
+-
+- if (err) {
+- pr_err("GPU CMA %s failed\n", cma_op);
+- err = -EFAULT;
+- break;
+- }
+-
+- /* Later fence available. Release old fence */
+- if (fence && lfence) {
+- dma_fence_put(lfence);
+- lfence = NULL;
+- }
+-
+- total_copied += copied;
+- src_size_to_copy -= copied;
+- space_left -= copied;
+- dst_va_addr += copied;
+- dst_offset += copied;
+- src_offset += copied;
+- if (dst_va_addr > dst_bo->it.last + 1) {
+- pr_err("CMA %s fail. Mem overflow\n", cma_op);
+- err = -EFAULT;
+- break;
+- }
+-
+- /* If the cur dest range is full move to next one */
+- if (space_left <= 0) {
+- if (++j >= args->dst_mem_array_size)
+- break;
+-
+- dst_va_addr = dst_array[j].va_addr;
+- dst_va_addr_end = dst_va_addr +
+- dst_array[j].size - 1;
+- dst_bo = kfd_process_find_bo_from_interval(
+- dst_p,
+- dst_va_addr,
+- dst_va_addr_end);
+- if (!dst_bo ||
+- dst_va_addr_end > dst_bo->it.last) {
+- pr_err("CMA %s failed. Invalid dst range\n",
+- cma_op);
+- err = -EFAULT;
+- break;
+- }
+- dst_offset = dst_va_addr - dst_bo->it.start;
+- }
+-
+- /* If the cur src range is done, move to next one */
+- if (src_size_to_copy <= 0)
+- break;
+- }
+- if (err)
+- break;
+- }
+-
+- /* Wait for the last fence irrespective of error condition */
+- if (fence) {
+- if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000))
+- < 0)
+- pr_err("CMA %s failed. BO timed out\n", cma_op);
+- dma_fence_put(fence);
+- } else if (lfence) {
+- pr_debug("GPU copy fail. But wait for prev DMA to finish\n");
+- dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000));
+- dma_fence_put(lfence);
+- }
+-
+-kfd_process_fail:
+- mmput(remote_mm);
+-mm_access_fail:
+- put_task_struct(remote_task);
+-get_pid_task_fail:
+- put_pid(remote_pid);
+-copy_from_user_fail:
+- kfree(src_array);
+-
+- /* An error could happen after partial copy. In that case this will
+- * reflect partial amount of bytes copied
+- */
+- args->bytes_copied = total_copied;
+- return err;
+-}
+-
+-static int kfd_ioctl_get_queue_wave_state(struct file *filep,
+- struct kfd_process *p, void *data)
+-{
+- struct kfd_ioctl_get_queue_wave_state_args *args = data;
+- int r;
+-
+- mutex_lock(&p->mutex);
+-
+- r = pqm_get_wave_state(&p->pqm, args->queue_id,
+- (void __user *)args->ctl_stack_address,
+- &args->ctl_stack_used_size,
+- &args->save_area_used_size);
+-
+- mutex_unlock(&p->mutex);
+-
+- return r;
+-}
+-
+ #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
+ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
+ .cmd_drv = 0, .name = #ioctl}
+@@ -1995,54 +979,11 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL,
+ kfd_ioctl_dbg_wave_control, 0),
+
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU,
+- kfd_ioctl_alloc_memory_of_gpu, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU,
+- kfd_ioctl_free_memory_of_gpu, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU,
+- kfd_ioctl_map_memory_to_gpu, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU,
+- kfd_ioctl_unmap_memory_from_gpu, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH,
+- kfd_ioctl_alloc_scratch_memory, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK,
+- kfd_ioctl_set_cu_mask, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE,
+- kfd_ioctl_set_process_dgpu_aperture, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
+- kfd_ioctl_set_trap_handler, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
+- kfd_ioctl_get_process_apertures_new, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO,
+- kfd_ioctl_get_dmabuf_info, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF,
+- kfd_ioctl_import_dmabuf, 0),
++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA,
++ kfd_ioctl_set_scratch_backing_va, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG,
+- kfd_ioctl_get_tile_config, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE,
+- kfd_ioctl_ipc_import_handle, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE,
+- kfd_ioctl_ipc_export_handle, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY,
+- kfd_ioctl_cross_memory_copy, 0),
+-
+- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE,
+- kfd_ioctl_get_queue_wave_state, 0)
+-
++ kfd_ioctl_get_tile_config, 0)
+ };
+
+ #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
+@@ -2138,34 +1079,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
+ {
+ struct kfd_process *process;
+- struct kfd_dev *kfd;
+- unsigned long vm_pgoff;
+- unsigned long long mmap_type;
+
+ process = kfd_get_process(current);
+ if (IS_ERR(process))
+ return PTR_ERR(process);
+
+- vm_pgoff = vma->vm_pgoff;
+- vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff);
+- mmap_type = vm_pgoff & KFD_MMAP_TYPE_MASK;
+-
+- switch (mmap_type) {
+- case KFD_MMAP_TYPE_DOORBELL:
+- kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff));
+- if (!kfd)
+- return -EFAULT;
+- return kfd_doorbell_mmap(kfd, process, vma);
+-
+- case KFD_MMAP_TYPE_EVENTS:
++ if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) ==
++ KFD_MMAP_DOORBELL_MASK) {
++ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK;
++ return kfd_doorbell_mmap(process, vma);
++ } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) ==
++ KFD_MMAP_EVENTS_MASK) {
++ vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK;
+ return kfd_event_mmap(process, vma);
+-
+- case KFD_MMAP_TYPE_RESERVED_MEM:
+- return kfd_reserved_mem_mmap(process, vma);
+-
+- default:
+- pr_err("Unsupported kfd mmap type %llx\n", mmap_type);
+- break;
+ }
+
+ return -EFAULT;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+deleted file mode 100644
+index 71525cf..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
++++ /dev/null
+@@ -1,1339 +0,0 @@
+-#include <linux/kernel.h>
+-#include <linux/acpi.h>
+-#include <linux/mm.h>
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+-#include <linux/amd-iommu.h>
+-#endif
+-#include <linux/pci.h>
+-#include "kfd_crat.h"
+-#include "kfd_priv.h"
+-#include "kfd_topology.h"
+-
+-/* GPU Processor ID base for dGPUs for which VCRAT needs to be created.
+- * GPU processor ID are expressed with Bit[31]=1.
+- * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs
+- * used in the CRAT.
+- */
+-static uint32_t gpu_processor_id_low = 0x80001000;
+-
+-/* Return the next available gpu_processor_id and increment it for next GPU
+- * @total_cu_count - Total CUs present in the GPU including ones
+- * masked off
+- */
+-static inline unsigned int get_and_inc_gpu_processor_id(
+- unsigned int total_cu_count)
+-{
+- int current_id = gpu_processor_id_low;
+-
+- gpu_processor_id_low += total_cu_count;
+- return current_id;
+-}
+-
+-/* Static table to describe GPU Cache information */
+-struct kfd_gpu_cache_info {
+- uint32_t cache_size;
+- uint32_t cache_level;
+- uint32_t flags;
+- /* Indicates how many Compute Units share this cache
+- * Value = 1 indicates the cache is not shared
+- */
+- uint32_t num_cu_shared;
+-};
+-
+-static struct kfd_gpu_cache_info kaveri_cache_info[] = {
+- {
+- /* TCP L1 Cache per CU */
+- .cache_size = 16,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_DATA_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 1,
+-
+- },
+- {
+- /* Scalar L1 Instruction Cache (in SQC module) per bank */
+- .cache_size = 16,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_INST_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 2,
+- },
+- {
+- /* Scalar L1 Data Cache (in SQC module) per bank */
+- .cache_size = 8,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_DATA_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 2,
+- },
+-
+- /* TODO: Add L2 Cache information */
+-};
+-
+-
+-static struct kfd_gpu_cache_info carrizo_cache_info[] = {
+- {
+- /* TCP L1 Cache per CU */
+- .cache_size = 16,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_DATA_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 1,
+- },
+- {
+- /* Scalar L1 Instruction Cache (in SQC module) per bank */
+- .cache_size = 8,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_INST_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 4,
+- },
+- {
+- /* Scalar L1 Data Cache (in SQC module) per bank. */
+- .cache_size = 4,
+- .cache_level = 1,
+- .flags = (CRAT_CACHE_FLAGS_ENABLED |
+- CRAT_CACHE_FLAGS_DATA_CACHE |
+- CRAT_CACHE_FLAGS_SIMD_CACHE),
+- .num_cu_shared = 4,
+- },
+-
+- /* TODO: Add L2 Cache information */
+-};
+-
+-/* NOTE: In future if more information is added to struct kfd_gpu_cache_info
+- * the following ASICs may need a separate table.
+- */
+-#define hawaii_cache_info kaveri_cache_info
+-#define tonga_cache_info carrizo_cache_info
+-#define fiji_cache_info carrizo_cache_info
+-#define polaris10_cache_info carrizo_cache_info
+-#define polaris11_cache_info carrizo_cache_info
+-/* TODO - check & update Vega10 cache details */
+-#define vega10_cache_info carrizo_cache_info
+-#define raven_cache_info carrizo_cache_info
+-
+-static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
+- struct crat_subtype_computeunit *cu)
+-{
+- dev->node_props.cpu_cores_count = cu->num_cpu_cores;
+- dev->node_props.cpu_core_id_base = cu->processor_id_low;
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
+- dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+-#endif
+-
+- pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
+- cu->processor_id_low);
+-}
+-
+-static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
+- struct crat_subtype_computeunit *cu)
+-{
+- dev->node_props.simd_id_base = cu->processor_id_low;
+- dev->node_props.simd_count = cu->num_simd_cores;
+- dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
+- dev->node_props.max_waves_per_simd = cu->max_waves_simd;
+- dev->node_props.wave_front_size = cu->wave_front_size;
+- dev->node_props.array_count = cu->array_count;
+- dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
+- dev->node_props.simd_per_cu = cu->num_simd_per_cu;
+- dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
+- if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
+- dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
+- pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
+-}
+-
+-/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct
+- * topology device present in the device_list
+- */
+-static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu,
+- struct list_head *device_list)
+-{
+- struct kfd_topology_device *dev;
+-
+- pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
+- cu->proximity_domain, cu->hsa_capability);
+- list_for_each_entry(dev, device_list, list) {
+- if (cu->proximity_domain == dev->proximity_domain) {
+- if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
+- kfd_populated_cu_info_cpu(dev, cu);
+-
+- if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
+- kfd_populated_cu_info_gpu(dev, cu);
+- break;
+- }
+- }
+-
+- return 0;
+-}
+-
+-static struct kfd_mem_properties *
+-find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width,
+- struct kfd_topology_device *dev)
+-{
+- struct kfd_mem_properties *props;
+-
+- list_for_each_entry(props, &dev->mem_props, list) {
+- if (props->heap_type == heap_type
+- && props->flags == flags
+- && props->width == width)
+- return props;
+- }
+-
+- return NULL;
+-}
+-/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct
+- * topology device present in the device_list
+- */
+-static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem,
+- struct list_head *device_list)
+-{
+- struct kfd_mem_properties *props;
+- struct kfd_topology_device *dev;
+- uint32_t heap_type;
+- uint64_t size_in_bytes;
+- uint32_t flags = 0;
+- uint32_t width;
+-
+- pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
+- mem->proximity_domain);
+- list_for_each_entry(dev, device_list, list) {
+- if (mem->proximity_domain == dev->proximity_domain) {
+- /* We're on GPU node */
+- if (dev->node_props.cpu_cores_count == 0) {
+- /* APU */
+- if (mem->visibility_type == 0)
+- heap_type =
+- HSA_MEM_HEAP_TYPE_FB_PRIVATE;
+- /* dGPU */
+- else
+- heap_type = mem->visibility_type;
+- } else
+- heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
+-
+- if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
+- flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
+- if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
+- flags |= HSA_MEM_FLAGS_NON_VOLATILE;
+-
+- size_in_bytes =
+- ((uint64_t)mem->length_high << 32) +
+- mem->length_low;
+- width = mem->width;
+-
+- /* Multiple banks of the same type are aggregated into
+- * one. User mode doesn't care about multiple physical
+- * memory segments. It's managed as a single virtual
+- * heap for user mode.
+- */
+- props = find_subtype_mem(heap_type, flags, width, dev);
+- if (props) {
+- props->size_in_bytes += size_in_bytes;
+- break;
+- }
+-
+- props = kfd_alloc_struct(props);
+- if (!props)
+- return -ENOMEM;
+-
+- props->heap_type = heap_type;
+- props->flags = flags;
+- props->size_in_bytes = size_in_bytes;
+- props->width = width;
+-
+- dev->node_props.mem_banks_count++;
+- list_add_tail(&props->list, &dev->mem_props);
+-
+- break;
+- }
+- }
+-
+- return 0;
+-}
+-
+-/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct
+- * topology device present in the device_list
+- */
+-static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache,
+- struct list_head *device_list)
+-{
+- struct kfd_cache_properties *props;
+- struct kfd_topology_device *dev;
+- uint32_t id;
+- uint32_t total_num_of_cu;
+-
+- id = cache->processor_id_low;
+-
+- list_for_each_entry(dev, device_list, list) {
+- total_num_of_cu = (dev->node_props.array_count *
+- dev->node_props.cu_per_simd_array);
+-
+-		/* Cache information in CRAT doesn't have proximity_domain
+- * information as it is associated with a CPU core or GPU
+- * Compute Unit. So map the cache using CPU core Id or SIMD
+- * (GPU) ID.
+- * TODO: This works because currently we can safely assume that
+- * Compute Units are parsed before caches are parsed. In
+- * future, remove this dependency
+- */
+- if ((id >= dev->node_props.cpu_core_id_base &&
+- id <= dev->node_props.cpu_core_id_base +
+- dev->node_props.cpu_cores_count) ||
+- (id >= dev->node_props.simd_id_base &&
+- id < dev->node_props.simd_id_base +
+- total_num_of_cu)) {
+- props = kfd_alloc_struct(props);
+- if (!props)
+- return -ENOMEM;
+-
+- props->processor_id_low = id;
+- props->cache_level = cache->cache_level;
+- props->cache_size = cache->cache_size;
+- props->cacheline_size = cache->cache_line_size;
+- props->cachelines_per_tag = cache->lines_per_tag;
+- props->cache_assoc = cache->associativity;
+- props->cache_latency = cache->cache_latency;
+- memcpy(props->sibling_map, cache->sibling_map,
+- sizeof(props->sibling_map));
+-
+- if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
+- props->cache_type |= HSA_CACHE_TYPE_DATA;
+- if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
+- props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
+- if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
+- props->cache_type |= HSA_CACHE_TYPE_CPU;
+- if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
+- props->cache_type |= HSA_CACHE_TYPE_HSACU;
+-
+- dev->cache_count++;
+- dev->node_props.caches_count++;
+- list_add_tail(&props->list, &dev->cache_props);
+-
+- break;
+- }
+- }
+-
+- return 0;
+-}
+-
+-/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct
+- * topology device present in the device_list
+- */
+-static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
+- struct list_head *device_list)
+-{
+- struct kfd_iolink_properties *props = NULL, *props2;
+- struct kfd_topology_device *dev, *cpu_dev;
+- uint32_t id_from;
+- uint32_t id_to;
+-
+- id_from = iolink->proximity_domain_from;
+- id_to = iolink->proximity_domain_to;
+-
+- pr_debug("Found IO link entry in CRAT table with id_from=%d\n",
+- id_from);
+- list_for_each_entry(dev, device_list, list) {
+- if (id_from == dev->proximity_domain) {
+- props = kfd_alloc_struct(props);
+- if (!props)
+- return -ENOMEM;
+-
+- props->node_from = id_from;
+- props->node_to = id_to;
+- props->ver_maj = iolink->version_major;
+- props->ver_min = iolink->version_minor;
+- props->iolink_type = iolink->io_interface_type;
+-
+- if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
+- props->weight = 20;
+- else
+- props->weight = node_distance(id_from, id_to);
+-
+- props->min_latency = iolink->minimum_latency;
+- props->max_latency = iolink->maximum_latency;
+- props->min_bandwidth = iolink->minimum_bandwidth_mbs;
+- props->max_bandwidth = iolink->maximum_bandwidth_mbs;
+- props->rec_transfer_size =
+- iolink->recommended_transfer_size;
+-
+- dev->io_link_count++;
+- dev->node_props.io_links_count++;
+- list_add_tail(&props->list, &dev->io_link_props);
+- break;
+- }
+- }
+-
+- /* CPU topology is created before GPUs are detected, so CPU->GPU
+- * links are not built at that time. If a PCIe type is discovered, it
+- * means a GPU is detected and we are adding GPU->CPU to the topology.
+-	 * At this time, also add the corresponding CPU->GPU link.
+- */
+- if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
+- cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
+- if (!cpu_dev)
+- return -ENODEV;
+- /* same everything but the other direction */
+- props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
+- props2->node_from = id_to;
+- props2->node_to = id_from;
+- props2->kobj = NULL;
+- cpu_dev->io_link_count++;
+- cpu_dev->node_props.io_links_count++;
+- list_add_tail(&props2->list, &cpu_dev->io_link_props);
+- }
+-
+- return 0;
+-}
+-
+-/* kfd_parse_subtype - parse subtypes and attach it to correct topology device
+- * present in the device_list
+- * @sub_type_hdr - subtype section of crat_image
+- * @device_list - list of topology devices present in this crat_image
+- */
+-static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+- struct list_head *device_list)
+-{
+- struct crat_subtype_computeunit *cu;
+- struct crat_subtype_memory *mem;
+- struct crat_subtype_cache *cache;
+- struct crat_subtype_iolink *iolink;
+- int ret = 0;
+-
+- switch (sub_type_hdr->type) {
+- case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
+- cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+- ret = kfd_parse_subtype_cu(cu, device_list);
+- break;
+- case CRAT_SUBTYPE_MEMORY_AFFINITY:
+- mem = (struct crat_subtype_memory *)sub_type_hdr;
+- ret = kfd_parse_subtype_mem(mem, device_list);
+- break;
+- case CRAT_SUBTYPE_CACHE_AFFINITY:
+- cache = (struct crat_subtype_cache *)sub_type_hdr;
+- ret = kfd_parse_subtype_cache(cache, device_list);
+- break;
+- case CRAT_SUBTYPE_TLB_AFFINITY:
+- /* For now, nothing to do here */
+- pr_debug("Found TLB entry in CRAT table (not processing)\n");
+- break;
+- case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
+- /* For now, nothing to do here */
+- pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
+- break;
+- case CRAT_SUBTYPE_IOLINK_AFFINITY:
+- iolink = (struct crat_subtype_iolink *)sub_type_hdr;
+- ret = kfd_parse_subtype_iolink(iolink, device_list);
+- break;
+- default:
+- pr_warn("Unknown subtype %d in CRAT\n",
+- sub_type_hdr->type);
+- }
+-
+- return ret;
+-}
+-
+-/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT
+- * create a kfd_topology_device and add in to device_list. Also parse
+- * CRAT subtypes and attach it to appropriate kfd_topology_device
+- * @crat_image - input image containing CRAT
+- * @device_list - [OUT] list of kfd_topology_device generated after
+- * parsing crat_image
+- * @proximity_domain - Proximity domain of the first device in the table
+- *
+- * Return - 0 if successful else -ve value
+- */
+-int kfd_parse_crat_table(void *crat_image,
+- struct list_head *device_list,
+- uint32_t proximity_domain)
+-{
+- struct kfd_topology_device *top_dev = NULL;
+- struct crat_subtype_generic *sub_type_hdr;
+- uint16_t node_id;
+- int ret = 0;
+- struct crat_header *crat_table = (struct crat_header *)crat_image;
+- uint16_t num_nodes;
+- uint32_t image_len;
+- uint32_t last_header_type, last_header_length;
+-
+- if (!crat_image)
+- return -EINVAL;
+-
+- if (!list_empty(device_list)) {
+- pr_warn("Error device list should be empty\n");
+- return -EINVAL;
+- }
+-
+- num_nodes = crat_table->num_domains;
+- image_len = crat_table->length;
+-
+- pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
+-
+- for (node_id = 0; node_id < num_nodes; node_id++) {
+- top_dev = kfd_create_topology_device(device_list);
+- if (!top_dev)
+- break;
+- top_dev->proximity_domain = proximity_domain++;
+- }
+-
+- if (!top_dev) {
+- ret = -ENOMEM;
+- goto err;
+- }
+-
+- memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH);
+- memcpy(top_dev->oem_table_id, crat_table->oem_table_id,
+- CRAT_OEMTABLEID_LENGTH);
+- top_dev->oem_revision = crat_table->oem_revision;
+-
+- last_header_type = last_header_length = 0;
+- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+- while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
+- ((char *)crat_image) + image_len) {
+- pr_debug("Parsing CRAT subtype header %p enabled: %s type: 0x%x length %d\n",
+- sub_type_hdr,
+- (sub_type_hdr->flags &
+- CRAT_SUBTYPE_FLAGS_ENABLED)
+- ? "true" : "false",
+- sub_type_hdr->type,
+- sub_type_hdr->length);
+-
+- if (sub_type_hdr->length == 0) {
+- pr_err("Parsing wrong CRAT's subtype header last header type: %d last header len %d\n",
+-				last_header_type, last_header_length);
+- pr_err("Current header type %d length %d\n",
+- sub_type_hdr->type, sub_type_hdr->length);
+- break;
+- }
+-
+- if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
+- ret = kfd_parse_subtype(sub_type_hdr, device_list);
+- if (ret != 0)
+- break;
+- }
+-
+- last_header_type = sub_type_hdr->type;
+- last_header_length = sub_type_hdr->length;
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+- }
+-
+-err:
+- if (ret)
+- kfd_release_topology_device_list(device_list);
+-
+- return ret;
+-}
+-
+-/* Helper function. See kfd_fill_gpu_cache_info for parameter description */
+-static int fill_in_pcache(struct crat_subtype_cache *pcache,
+- struct kfd_gpu_cache_info *pcache_info,
+- struct kfd_cu_info *cu_info,
+- int mem_available,
+- int cu_bitmask,
+- int cache_type, unsigned int cu_processor_id,
+- int cu_block)
+-{
+- unsigned int cu_sibling_map_mask;
+- int first_active_cu;
+-
+- /* First check if enough memory is available */
+- if (sizeof(struct crat_subtype_cache) > mem_available)
+- return -ENOMEM;
+-
+- cu_sibling_map_mask = cu_bitmask;
+- cu_sibling_map_mask >>= cu_block;
+- cu_sibling_map_mask &=
+- ((1 << pcache_info[cache_type].num_cu_shared) - 1);
+- first_active_cu = ffs(cu_sibling_map_mask);
+-
+-	/* A CU could be inactive. In case of a shared cache, find the first
+-	 * active CU; in case of a non-shared cache, check whether the CU is
+-	 * inactive. If it is inactive, skip it.
+- */
+- if (first_active_cu) {
+- memset(pcache, 0, sizeof(struct crat_subtype_cache));
+- pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+- pcache->length = sizeof(struct crat_subtype_cache);
+- pcache->flags = pcache_info[cache_type].flags;
+- pcache->processor_id_low = cu_processor_id
+- + (first_active_cu - 1);
+- pcache->cache_level = pcache_info[cache_type].cache_level;
+- pcache->cache_size = pcache_info[cache_type].cache_size;
+-
+- /* Sibling map is w.r.t processor_id_low, so shift out
+- * inactive CU
+- */
+- cu_sibling_map_mask =
+- cu_sibling_map_mask >> (first_active_cu - 1);
+-
+- pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+- pcache->sibling_map[1] =
+- (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+- pcache->sibling_map[2] =
+- (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+- pcache->sibling_map[3] =
+- (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+- return 0;
+- }
+- return 1;
+-}
+-
+-/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
+- * tables
+- *
+- * @kdev - [IN] GPU device
+- * @gpu_processor_id - [IN] GPU processor ID to which these caches
+- * associate
+- * @available_size - [IN] Amount of memory available in pcache
+- * @cu_info - [IN] Compute Unit info obtained from KGD
+- * @pcache - [OUT] memory into which cache data is to be filled in.
+- * @size_filled - [OUT] amount of data used up in pcache.
+- * @num_of_entries - [OUT] number of caches added
+- */
+-static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+- int gpu_processor_id,
+- int available_size,
+- struct kfd_cu_info *cu_info,
+- struct crat_subtype_cache *pcache,
+- int *size_filled,
+- int *num_of_entries)
+-{
+- struct kfd_gpu_cache_info *pcache_info;
+- int num_of_cache_types = 0;
+- int i, j, k;
+- int ct = 0;
+- int mem_available = available_size;
+- unsigned int cu_processor_id;
+- int ret;
+-
+- switch (kdev->device_info->asic_family) {
+- case CHIP_KAVERI:
+- pcache_info = kaveri_cache_info;
+- num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
+- break;
+- case CHIP_HAWAII:
+- pcache_info = hawaii_cache_info;
+- num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
+- break;
+- case CHIP_CARRIZO:
+- pcache_info = carrizo_cache_info;
+- num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
+- break;
+- case CHIP_TONGA:
+- pcache_info = tonga_cache_info;
+- num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
+- break;
+- case CHIP_FIJI:
+- pcache_info = fiji_cache_info;
+- num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
+- break;
+- case CHIP_POLARIS10:
+- pcache_info = polaris10_cache_info;
+- num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
+- break;
+- case CHIP_POLARIS11:
+- pcache_info = polaris11_cache_info;
+- num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
+- break;
+- case CHIP_VEGA10:
+- pcache_info = vega10_cache_info;
+- num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
+- break;
+- case CHIP_RAVEN:
+- pcache_info = raven_cache_info;
+- num_of_cache_types = ARRAY_SIZE(raven_cache_info);
+- break;
+- default:
+- return -EINVAL;
+- }
+-
+- *size_filled = 0;
+- *num_of_entries = 0;
+-
+- /* For each type of cache listed in the kfd_gpu_cache_info table,
+- * go through all available Compute Units.
+-	 * The [i,j,k] loop will:
+-	 *		if kfd_gpu_cache_info.num_cu_shared == 1,
+-	 *			parse through all available CUs;
+-	 *		if kfd_gpu_cache_info.num_cu_shared != 1,
+-	 *			consider only one CU from
+-	 *			each shared unit
+- */
+-
+- for (ct = 0; ct < num_of_cache_types; ct++) {
+- cu_processor_id = gpu_processor_id;
+- for (i = 0; i < cu_info->num_shader_engines; i++) {
+- for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+- j++) {
+- for (k = 0; k < cu_info->num_cu_per_sh;
+- k += pcache_info[ct].num_cu_shared) {
+-
+- ret = fill_in_pcache(pcache,
+- pcache_info,
+- cu_info,
+- mem_available,
+- cu_info->cu_bitmap[i][j],
+- ct,
+- cu_processor_id,
+- k);
+-
+- if (ret < 0)
+- break;
+-
+- if (!ret) {
+- pcache++;
+- (*num_of_entries)++;
+- mem_available -=
+- sizeof(*pcache);
+- (*size_filled) +=
+- sizeof(*pcache);
+- }
+-
+- /* Move to next CU block */
+- cu_processor_id +=
+- pcache_info[ct].num_cu_shared;
+- }
+- }
+- }
+- }
+-
+- pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+-
+- return 0;
+-}
+-
+-/*
+- * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
+- * copies CRAT from ACPI (if available).
+- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+- *
+- * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
+- * crat_image will be NULL
+- * @size: [OUT] size of crat_image
+- *
+- * Return 0 if successful else return -ve value
+- */
+-#ifdef CONFIG_ACPI
+-int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+-{
+- struct acpi_table_header *crat_table;
+- acpi_status status;
+- void *pcrat_image;
+-
+- if (!crat_image)
+- return -EINVAL;
+-
+- *crat_image = NULL;
+-
+- /* Fetch the CRAT table from ACPI */
+- status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
+- if (status == AE_NOT_FOUND) {
+- pr_warn("CRAT table not found\n");
+- return -ENODATA;
+- } else if (ACPI_FAILURE(status)) {
+- const char *err = acpi_format_exception(status);
+-
+- pr_err("CRAT table error: %s\n", err);
+- return -EINVAL;
+- }
+-
+- if (ignore_crat) {
+- pr_info("CRAT table disabled by module option\n");
+- return -ENODATA;
+- }
+-
+- pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
+- if (!pcrat_image) {
+- pr_err("No memory for allocating CRAT image\n");
+- return -ENOMEM;
+- }
+-
+- memcpy(pcrat_image, crat_table, crat_table->length);
+-
+- *crat_image = pcrat_image;
+- *size = crat_table->length;
+-
+- return 0;
+-}
+-#endif
+-
+-/* Memory required to create Virtual CRAT.
+- * Since there is no easy way to predict the amount of memory required, the
+- * following amounts are allocated for the CPU and GPU Virtual CRAT. This is
+- * expected to cover all known conditions. But to be safe, an additional check
+- * is put in the code to ensure we don't write beyond the allocated size.
+- */
+-#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE)
+-#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE)
+-
+-/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
+- *
+- * @numa_node_id: CPU NUMA node id
+- * @avail_size: Available size in the memory
+- * @sub_type_hdr: Memory into which compute info will be filled in
+- *
+- * Return 0 if successful else return -ve value
+- */
+-static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size,
+- int proximity_domain,
+- struct crat_subtype_computeunit *sub_type_hdr)
+-{
+- const struct cpumask *cpumask;
+-
+- *avail_size -= sizeof(struct crat_subtype_computeunit);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
+-
+- /* Fill in subtype header data */
+- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
+- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- cpumask = cpumask_of_node(numa_node_id);
+-
+- /* Fill in CU data */
+- sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT;
+- sub_type_hdr->proximity_domain = proximity_domain;
+- sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id);
+- if (sub_type_hdr->processor_id_low == -1)
+- return -EINVAL;
+-
+- sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);
+-
+- return 0;
+-}
+-
+-/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node
+- *
+- * @numa_node_id: CPU NUMA node id
+- * @avail_size: Available size in the memory
+- * @sub_type_hdr: Memory into which memory info will be filled in
+- *
+- * Return 0 if successful else return -ve value
+- */
+-static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size,
+- int proximity_domain,
+- struct crat_subtype_memory *sub_type_hdr)
+-{
+- uint64_t mem_in_bytes = 0;
+- pg_data_t *pgdat;
+- int zone_type;
+-
+- *avail_size -= sizeof(struct crat_subtype_memory);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+-
+- /* Fill in subtype header data */
+- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- /* Fill in Memory Subunit data */
+-
+- /* Unlike si_meminfo, si_meminfo_node is not exported. So
+- * the following lines are duplicated from si_meminfo_node
+- * function
+- */
+- pgdat = NODE_DATA(numa_node_id);
+- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
+- mem_in_bytes += pgdat->node_zones[zone_type].managed_pages;
+- mem_in_bytes <<= PAGE_SHIFT;
+-
+- sub_type_hdr->length_low = lower_32_bits(mem_in_bytes);
+- sub_type_hdr->length_high = upper_32_bits(mem_in_bytes);
+- sub_type_hdr->proximity_domain = proximity_domain;
+-
+- return 0;
+-}
+-
+-#ifdef CONFIG_X86_64
+-static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
+- uint32_t *num_entries,
+- struct crat_subtype_iolink *sub_type_hdr)
+-{
+- int nid;
+- struct cpuinfo_x86 *c = &cpu_data(0);
+- uint8_t link_type;
+-
+- if (c->x86_vendor == X86_VENDOR_AMD)
+- link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
+- else
+- link_type = CRAT_IOLINK_TYPE_QPI_1_1;
+-
+- *num_entries = 0;
+-
+- /* Create IO links from this node to other CPU nodes */
+- for_each_online_node(nid) {
+- if (nid == numa_node_id) /* node itself */
+- continue;
+-
+- *avail_size -= sizeof(struct crat_subtype_iolink);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+-
+- /* Fill in subtype header data */
+- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- /* Fill in IO link data */
+- sub_type_hdr->proximity_domain_from = numa_node_id;
+- sub_type_hdr->proximity_domain_to = nid;
+- sub_type_hdr->io_interface_type = link_type;
+-
+- (*num_entries)++;
+- sub_type_hdr++;
+- }
+-
+- return 0;
+-}
+-#endif
+-
+-/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
+- *
+- * @pcrat_image: Fill in VCRAT for CPU
+- * @size: [IN] allocated size of crat_image.
+- * [OUT] actual size of data filled in crat_image
+- */
+-static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
+-{
+- struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+- struct crat_subtype_generic *sub_type_hdr;
+- int avail_size = *size;
+- int numa_node_id;
+- int ret = 0;
+-#ifdef CONFIG_ACPI
+- struct acpi_table_header *acpi_table;
+- acpi_status status;
+-#endif
+-#ifdef CONFIG_X86_64
+- uint32_t entries = 0;
+-#endif
+-
+- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU)
+- return -EINVAL;
+-
+- /* Fill in CRAT Header.
+- * Modify length and total_entries as subunits are added.
+- */
+- avail_size -= sizeof(struct crat_header);
+- if (avail_size < 0)
+- return -ENOMEM;
+-
+- memset(crat_table, 0, sizeof(struct crat_header));
+- memcpy(&crat_table->signature, CRAT_SIGNATURE,
+- sizeof(crat_table->signature));
+- crat_table->length = sizeof(struct crat_header);
+-
+-#ifdef CONFIG_ACPI
+- status = acpi_get_table("DSDT", 0, &acpi_table);
+- if (status == AE_NOT_FOUND)
+- pr_warn("DSDT table not found for OEM information\n");
+- else {
+- crat_table->oem_revision = acpi_table->revision;
+- memcpy(crat_table->oem_id, acpi_table->oem_id,
+- CRAT_OEMID_LENGTH);
+- memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
+- CRAT_OEMTABLEID_LENGTH);
+- }
+-#else
+- crat_table->oem_revision = 0;
+- memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH);
+- memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH);
+-#endif
+- crat_table->total_entries = 0;
+- crat_table->num_domains = 0;
+-
+- sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
+-
+- for_each_online_node(numa_node_id) {
+- if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
+- continue;
+-
+- /* Fill in Subtype: Compute Unit */
+- ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
+- crat_table->num_domains,
+- (struct crat_subtype_computeunit *)sub_type_hdr);
+- if (ret < 0)
+- return ret;
+- crat_table->length += sub_type_hdr->length;
+- crat_table->total_entries++;
+-
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+-
+- /* Fill in Subtype: Memory */
+- ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
+- crat_table->num_domains,
+- (struct crat_subtype_memory *)sub_type_hdr);
+- if (ret < 0)
+- return ret;
+- crat_table->length += sub_type_hdr->length;
+- crat_table->total_entries++;
+-
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+-
+- /* Fill in Subtype: IO Link */
+-#ifdef CONFIG_X86_64
+- ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
+- &entries,
+- (struct crat_subtype_iolink *)sub_type_hdr);
+- if (ret < 0)
+- return ret;
+- crat_table->length += (sub_type_hdr->length * entries);
+- crat_table->total_entries += entries;
+-
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length * entries);
+-#else
+- pr_info("IO link not available for non x86 platforms\n");
+-#endif
+-
+- crat_table->num_domains++;
+- }
+-
+- /* TODO: Add cache Subtype for CPU.
+- * Currently, CPU cache information is available in function
+- * detect_cache_attributes(cpu) defined in the file
+- * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
+- * exported and to get the same information the code needs to be
+- * duplicated.
+- */
+-
+- *size = crat_table->length;
+- pr_info("Virtual CRAT table created for CPU\n");
+-
+- return 0;
+-}
+-
+-static int kfd_fill_gpu_memory_affinity(int *avail_size,
+- struct kfd_dev *kdev, uint8_t type, uint64_t size,
+- struct crat_subtype_memory *sub_type_hdr,
+- uint32_t proximity_domain,
+- const struct kfd_local_mem_info *local_mem_info)
+-{
+- *avail_size -= sizeof(struct crat_subtype_memory);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+- sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- sub_type_hdr->proximity_domain = proximity_domain;
+-
+- pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
+- type, size);
+-
+- sub_type_hdr->length_low = lower_32_bits(size);
+- sub_type_hdr->length_high = upper_32_bits(size);
+-
+- sub_type_hdr->width = local_mem_info->vram_width;
+- sub_type_hdr->visibility_type = type;
+-
+- return 0;
+-}
+-
+-/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
+- * to its NUMA node
+- * @avail_size: Available size in the memory
+- * @kdev - [IN] GPU device
+- * @sub_type_hdr: Memory into which io link info will be filled in
+- * @proximity_domain - proximity domain of the GPU node
+- *
+- * Return 0 if successful else return -ve value
+- */
+-static int kfd_fill_gpu_direct_io_link(int *avail_size,
+- struct kfd_dev *kdev,
+- struct crat_subtype_iolink *sub_type_hdr,
+- uint32_t proximity_domain)
+-{
+- *avail_size -= sizeof(struct crat_subtype_iolink);
+- if (*avail_size < 0)
+- return -ENOMEM;
+-
+- memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+-
+- /* Fill in subtype header data */
+- sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+- sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- /* Fill in IOLINK subtype.
+- * TODO: Fill-in other fields of iolink subtype
+- */
+- sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+- sub_type_hdr->proximity_domain_from = proximity_domain;
+-#ifdef CONFIG_NUMA
+- if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
+- sub_type_hdr->proximity_domain_to = 0;
+- else
+- sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
+-#else
+- sub_type_hdr->proximity_domain_to = 0;
+-#endif
+- return 0;
+-}
+-
+-/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
+- *
+- * @pcrat_image: Fill in VCRAT for GPU
+- * @size: [IN] allocated size of crat_image.
+- * [OUT] actual size of data filled in crat_image
+- */
+-static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+- size_t *size, struct kfd_dev *kdev,
+- uint32_t proximity_domain)
+-{
+- struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+- struct crat_subtype_generic *sub_type_hdr;
+- struct crat_subtype_computeunit *cu;
+- struct kfd_cu_info cu_info;
+- int avail_size = *size;
+- uint32_t total_num_of_cu;
+- int num_of_cache_entries = 0;
+- int cache_mem_filled = 0;
+- int ret = 0;
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- struct amd_iommu_device_info iommu_info;
+- const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+- AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
+- AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+-#endif
+- struct kfd_local_mem_info local_mem_info;
+-
+- if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
+- return -EINVAL;
+-
+- /* Fill the CRAT Header.
+- * Modify length and total_entries as subunits are added.
+- */
+- avail_size -= sizeof(struct crat_header);
+- if (avail_size < 0)
+- return -ENOMEM;
+-
+- memset(crat_table, 0, sizeof(struct crat_header));
+-
+- memcpy(&crat_table->signature, CRAT_SIGNATURE,
+- sizeof(crat_table->signature));
+- /* Change length as we add more subtypes*/
+- crat_table->length = sizeof(struct crat_header);
+- crat_table->num_domains = 1;
+- crat_table->total_entries = 0;
+-
+- /* Fill in Subtype: Compute Unit
+- * First fill in the sub type header and then sub type data
+- */
+- avail_size -= sizeof(struct crat_subtype_computeunit);
+- if (avail_size < 0)
+- return -ENOMEM;
+-
+- sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
+- memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));
+-
+- sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
+- sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
+- sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;
+-
+- /* Fill CU subtype data */
+- cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+- cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
+- cu->proximity_domain = proximity_domain;
+-
+- kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info);
+- cu->num_simd_per_cu = cu_info.simd_per_cu;
+- cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number;
+- cu->max_waves_simd = cu_info.max_waves_per_simd;
+-
+- cu->wave_front_size = cu_info.wave_front_size;
+- cu->array_count = cu_info.num_shader_arrays_per_engine *
+- cu_info.num_shader_engines;
+- total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
+- cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
+- cu->num_cu_per_array = cu_info.num_cu_per_sh;
+- cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
+- cu->num_banks = cu_info.num_shader_engines;
+- cu->lds_size_in_kb = cu_info.lds_size;
+-
+- cu->hsa_capability = 0;
+-
+- /* Check if this node supports IOMMU. During parsing this flag will
+- * translate to HSA_CAP_ATS_PRESENT
+- */
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- iommu_info.flags = 0;
+- if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) {
+- if ((iommu_info.flags & required_iommu_flags) ==
+- required_iommu_flags)
+- cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT;
+- }
+-#endif
+-
+- crat_table->length += sub_type_hdr->length;
+- crat_table->total_entries++;
+-
+- /* Fill in Subtype: Memory. Only on systems with large BAR (no
+- * private FB), report memory as public. On other systems
+- * report the total FB size (public+private) as a single
+- * private heap.
+- */
+- kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info);
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+-
+- if (debug_largebar)
+- local_mem_info.local_mem_size_private = 0;
+-
+- if (local_mem_info.local_mem_size_private == 0)
+- ret = kfd_fill_gpu_memory_affinity(&avail_size,
+- kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
+- local_mem_info.local_mem_size_public,
+- (struct crat_subtype_memory *)sub_type_hdr,
+- proximity_domain,
+- &local_mem_info);
+- else
+- ret = kfd_fill_gpu_memory_affinity(&avail_size,
+- kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
+- local_mem_info.local_mem_size_public +
+- local_mem_info.local_mem_size_private,
+- (struct crat_subtype_memory *)sub_type_hdr,
+- proximity_domain,
+- &local_mem_info);
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += sizeof(struct crat_subtype_memory);
+- crat_table->total_entries++;
+-
+- /* TODO: Fill in cache information. This information is NOT readily
+- * available in KGD
+- */
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- sub_type_hdr->length);
+- ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+- avail_size,
+- &cu_info,
+- (struct crat_subtype_cache *)sub_type_hdr,
+- &cache_mem_filled,
+- &num_of_cache_entries);
+-
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += cache_mem_filled;
+- crat_table->total_entries += num_of_cache_entries;
+- avail_size -= cache_mem_filled;
+-
+- /* Fill in Subtype: IO_LINKS
+-	 * Only direct links are added here, i.e. the link from the GPU
+-	 * to its NUMA node. Indirect links are added by userspace.
+- */
+- sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+- cache_mem_filled);
+- ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+- (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+-
+- if (ret < 0)
+- return ret;
+-
+- crat_table->length += sub_type_hdr->length;
+- crat_table->total_entries++;
+-
+- *size = crat_table->length;
+- pr_info("Virtual CRAT table created for GPU\n");
+-
+- return ret;
+-}
+-
+-/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
+- * creates a Virtual CRAT (VCRAT) image
+- *
+- * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+- *
+- * @crat_image: VCRAT image created because ACPI does not have a
+- * CRAT for this device
+- * @size: [OUT] size of virtual crat_image
+- * @flags: COMPUTE_UNIT_CPU - Create VCRAT for CPU device
+- * COMPUTE_UNIT_GPU - Create VCRAT for GPU
+- * (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
+- * -- this option is not currently implemented.
+- * The assumption is that all AMD APUs will have CRAT
+- * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+- *
+- * Return 0 if successful else return -ve value
+- */
+-int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+- int flags, struct kfd_dev *kdev, uint32_t proximity_domain)
+-{
+- void *pcrat_image = NULL;
+- int ret = 0;
+-
+- if (!crat_image)
+- return -EINVAL;
+-
+- *crat_image = NULL;
+-
+- /* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
+- * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
+-	 * all the current conditions. A check is in place to avoid writing
+-	 * beyond the allocated size.
+- */
+- switch (flags) {
+- case COMPUTE_UNIT_CPU:
+- pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+- if (!pcrat_image)
+- return -ENOMEM;
+- *size = VCRAT_SIZE_FOR_CPU;
+- ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
+- break;
+- case COMPUTE_UNIT_GPU:
+- if (!kdev)
+- return -EINVAL;
+- pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+- if (!pcrat_image)
+- return -ENOMEM;
+- *size = VCRAT_SIZE_FOR_GPU;
+- ret = kfd_create_vcrat_image_gpu(pcrat_image, size,
+- kdev, proximity_domain);
+- break;
+- case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
+- /* TODO: */
+- ret = -EINVAL;
+- pr_err("VCRAT not implemented for APU\n");
+- break;
+- default:
+- ret = -EINVAL;
+- }
+-
+- if (!ret)
+- *crat_image = pcrat_image;
+- else
+- kfree(pcrat_image);
+-
+- return ret;
+-}
+-
+-
+-/* kfd_destroy_crat_image
+- *
+- * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
+- *
+- */
+-void kfd_destroy_crat_image(void *crat_image)
+-{
+- kfree(crat_image);
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+index 00de41f..a374fa3 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+@@ -24,7 +24,6 @@
+ #define KFD_CRAT_H_INCLUDED
+
+ #include <linux/types.h>
+-#include "kfd_priv.h"
+
+ #pragma pack(1)
+
+@@ -45,10 +44,6 @@
+
+ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1)
+
+-/* Compute Unit flags */
+-#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */
+-#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */
+-
+ struct crat_header {
+ uint32_t signature;
+ uint32_t length;
+@@ -110,7 +105,7 @@ struct crat_subtype_computeunit {
+ uint8_t wave_front_size;
+ uint8_t num_banks;
+ uint16_t micro_engine_id;
+- uint8_t array_count;
++ uint8_t num_arrays;
+ uint8_t num_cu_per_array;
+ uint8_t num_simd_per_cu;
+ uint8_t max_slots_scatch_cu;
+@@ -132,14 +127,13 @@ struct crat_subtype_memory {
+ uint8_t length;
+ uint16_t reserved;
+ uint32_t flags;
+- uint32_t proximity_domain;
++ uint32_t promixity_domain;
+ uint32_t base_addr_low;
+ uint32_t base_addr_high;
+ uint32_t length_low;
+ uint32_t length_high;
+ uint32_t width;
+- uint8_t visibility_type; /* for virtual (dGPU) CRAT */
+- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1];
++ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH];
+ };
+
+ /*
+@@ -228,12 +222,9 @@ struct crat_subtype_ccompute {
+ /*
+ * HSA IO Link Affinity structure and definitions
+ */
+-#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0)
+-#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1)
+-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2)
+-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3)
+-#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4)
+-#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0
++#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001
++#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002
++#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc
+
+ /*
+ * IO interface types
+@@ -241,16 +232,8 @@ struct crat_subtype_ccompute {
+ #define CRAT_IOLINK_TYPE_UNDEFINED 0
+ #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1
+ #define CRAT_IOLINK_TYPE_PCIEXPRESS 2
+-#define CRAT_IOLINK_TYPE_AMBA 3
+-#define CRAT_IOLINK_TYPE_MIPI 4
+-#define CRAT_IOLINK_TYPE_QPI_1_1 5
+-#define CRAT_IOLINK_TYPE_RESERVED1 6
+-#define CRAT_IOLINK_TYPE_RESERVED2 7
+-#define CRAT_IOLINK_TYPE_RAPID_IO 8
+-#define CRAT_IOLINK_TYPE_INFINIBAND 9
+-#define CRAT_IOLINK_TYPE_RESERVED3 10
+-#define CRAT_IOLINK_TYPE_OTHER 11
+-#define CRAT_IOLINK_TYPE_MAX 255
++#define CRAT_IOLINK_TYPE_OTHER 3
++#define CRAT_IOLINK_TYPE_MAX 255
+
+ #define CRAT_IOLINK_RESERVED_LENGTH 24
+
+@@ -308,13 +291,4 @@ struct cdit_header {
+
+ #pragma pack()
+
+-#ifdef CONFIG_ACPI
+-int kfd_create_crat_image_acpi(void **crat_image, size_t *size);
+-#endif
+-void kfd_destroy_crat_image(void *crat_image);
+-int kfd_parse_crat_table(void *crat_image,
+- struct list_head *device_list,
+- uint32_t proximity_domain);
+-int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+- int flags, struct kfd_dev *kdev, uint32_t proximity_domain);
+ #endif /* KFD_CRAT_H_INCLUDED */
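The header above wraps the CRAT structures in #pragma pack(1) because the table layout is defined byte-for-byte by the firmware/ACPI tables, so the compiler must not insert padding. A small self-contained C sketch of the effect; the demo_* structs are invented for the example:

#include <stdio.h>
#include <stdint.h>

#pragma pack(1)
struct demo_packed {		/* laid out exactly as the bytes arrive */
	uint8_t type;
	uint32_t flags;
};
#pragma pack()

struct demo_natural {		/* same fields, default alignment rules */
	uint8_t type;
	uint32_t flags;
};

int main(void)
{
	/* Typically prints "packed=5 natural=8": packing removes the padding
	 * the compiler would otherwise add before the 32-bit member.
	 */
	printf("packed=%zu natural=%zu\n",
	       sizeof(struct demo_packed), sizeof(struct demo_natural));
	return 0;
}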
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+index df9b346..0aa021a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+@@ -29,7 +29,7 @@
+ #include <linux/mutex.h>
+ #include <linux/device.h>
+
+-#include "kfd_pm4_headers_vi.h"
++#include "kfd_pm4_headers.h"
+ #include "kfd_pm4_headers_diq.h"
+ #include "kfd_kernel_queue.h"
+ #include "kfd_priv.h"
+@@ -47,10 +47,9 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev)
+
+ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
+ unsigned int pasid, uint64_t vmid0_address,
+- uint32_t *packet_buff, size_t size_in_bytes,
+- bool sync)
++ uint32_t *packet_buff, size_t size_in_bytes)
+ {
+- struct pm4_mec_release_mem *rm_packet;
++ struct pm4__release_mem *rm_packet;
+ struct pm4__indirect_buffer_pasid *ib_packet;
+ struct kfd_mem_obj *mem_obj;
+ size_t pq_packets_size_in_bytes;
+@@ -66,9 +65,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
+
+ kq = dbgdev->kq;
+
+- pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid);
+- if (sync)
+- pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem);
++ pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) +
++ sizeof(struct pm4__indirect_buffer_pasid);
+
+ /*
+ * We acquire a buffer from DIQ
+@@ -97,15 +95,10 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
+ ib_packet->bitfields3.ib_base_hi = largep->u.high_part;
+
+ ib_packet->control = (1 << 23) | (1 << 31) |
+- ((size_in_bytes / 4) & 0xfffff);
++ ((size_in_bytes / sizeof(uint32_t)) & 0xfffff);
+
+ ib_packet->bitfields5.pasid = pasid;
+
+- if (!sync) {
+- kq->ops.submit_packet(kq);
+- return status;
+- }
+-
+ /*
+ * for now we use release mem for GPU-CPU synchronization
+ * Consider WaitRegMem + WriteData as a better alternative
+@@ -114,7 +107,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
+ * (a) Sync with HW
+ * (b) Sync var is written by CP to mem.
+ */
+- rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff +
++ rm_packet = (struct pm4__release_mem *) (ib_packet_buff +
+ (sizeof(struct pm4__indirect_buffer_pasid) /
+ sizeof(unsigned int)));
+
+@@ -133,7 +126,8 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
+
+ rm_packet->header.opcode = IT_RELEASE_MEM;
+ rm_packet->header.type = PM4_TYPE_3;
+- rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / 4 - 2;
++ rm_packet->header.count = sizeof(struct pm4__release_mem) /
++ sizeof(unsigned int) - 2;
+
+ rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+ rm_packet->bitfields2.event_index =
+@@ -190,9 +184,9 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
+ struct kernel_queue *kq = NULL;
+ int status;
+
+- properties.type = KFD_QUEUE_TYPE_DIQ;
+ status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,
+- &properties, &qid);
++ &properties, 0, KFD_QUEUE_TYPE_DIQ,
++ &qid);
+
+ if (status) {
+ pr_err("Failed to create DIQ\n");
+@@ -238,8 +232,7 @@ static void dbgdev_address_watch_set_registers(
+ union TCP_WATCH_ADDR_H_BITS *addrHi,
+ union TCP_WATCH_ADDR_L_BITS *addrLo,
+ union TCP_WATCH_CNTL_BITS *cntl,
+- unsigned int index, unsigned int vmid,
+- bool is_apu)
++ unsigned int index, unsigned int vmid)
+ {
+ union ULARGE_INTEGER addr;
+
+@@ -264,9 +257,9 @@ static void dbgdev_address_watch_set_registers(
+
+ cntl->bitfields.mode = adw_info->watch_mode[index];
+ cntl->bitfields.vmid = (uint32_t) vmid;
+- /* for APU assume it is an ATC address */
+- if (is_apu)
+- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
++ /* for now assume it is an ATC address */
++ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT;
++
+ pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask);
+ pr_debug("\t\t%20s %08x\n", "set reg add high :",
+ addrHi->bitfields.addr);
+@@ -308,8 +301,7 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev,
+
+ for (i = 0; i < adw_info->num_watch_points; i++) {
+ dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo,
+- &cntl, i, pdd->qpd.vmid,
+- dbgdev->dev->device_info->is_need_iommu_device);
++ &cntl, i, pdd->qpd.vmid);
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+@@ -348,9 +340,9 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ union TCP_WATCH_ADDR_H_BITS addrHi;
+ union TCP_WATCH_ADDR_L_BITS addrLo;
+ union TCP_WATCH_CNTL_BITS cntl;
++ struct kfd_mem_obj *mem_obj;
+ unsigned int aw_reg_add_dword;
+ uint32_t *packet_buff_uint;
+- uint64_t packet_buff_gpu_addr;
+ unsigned int i;
+ int status;
+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 4;
+@@ -372,13 +364,15 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ return -EINVAL;
+ }
+
+- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq,
+- ib_size/sizeof(uint32_t),
+- &packet_buff_uint, &packet_buff_gpu_addr);
++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
++
+ if (status) {
+- pr_err("Failed to allocate IB from DIQ ring\n");
++ pr_err("Failed to allocate GART memory\n");
+ return status;
+ }
++
++ packet_buff_uint = mem_obj->cpu_ptr;
++
+ memset(packet_buff_uint, 0, ib_size);
+
+ packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint);
+@@ -397,9 +391,12 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ packets_vec[3].bitfields2.insert_vmid = 1;
+
+ for (i = 0; i < adw_info->num_watch_points; i++) {
+- dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo,
+- &cntl, i, vmid,
+- dbgdev->dev->device_info->is_need_iommu_device);
++ dbgdev_address_watch_set_registers(adw_info,
++ &addrHi,
++ &addrLo,
++ &cntl,
++ i,
++ vmid);
+
+ pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *");
+ pr_debug("\t\t%20s %08x\n", "register index :", i);
+@@ -472,24 +469,24 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev,
+ status = dbgdev_diq_submit_ib(
+ dbgdev,
+ adw_info->process->pasid,
+- packet_buff_gpu_addr,
++ mem_obj->gpu_addr,
+ packet_buff_uint,
+- ib_size, true);
++ ib_size);
+
+ if (status) {
+ pr_err("Failed to submit IB to DIQ\n");
+- return status;
++ break;
+ }
+ }
+
++ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
+ return status;
+ }
+
+ static int dbgdev_wave_control_set_registers(
+ struct dbg_wave_control_info *wac_info,
+ union SQ_CMD_BITS *in_reg_sq_cmd,
+- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index,
+- unsigned int asic_family)
++ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index)
+ {
+ int status = 0;
+ union SQ_CMD_BITS reg_sq_cmd;
+@@ -547,25 +544,11 @@ static int dbgdev_wave_control_set_registers(
+
+ switch (wac_info->operand) {
+ case HSA_DBG_WAVEOP_HALT:
+- if (asic_family == CHIP_KAVERI) {
+- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT;
+- pr_debug("Halting KV\n");
+- } else {
+- reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT;
+- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT;
+- pr_debug("Halting CZ\n");
+- }
++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT;
+ break;
+
+ case HSA_DBG_WAVEOP_RESUME:
+- if (asic_family == CHIP_KAVERI) {
+- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME;
+- pr_debug("Resuming KV\n");
+- } else {
+- reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT;
+- reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME;
+- pr_debug("Resuming CZ\n");
+- }
++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME;
+ break;
+
+ case HSA_DBG_WAVEOP_KILL:
+@@ -605,15 +588,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ int status;
+ union SQ_CMD_BITS reg_sq_cmd;
+ union GRBM_GFX_INDEX_BITS reg_gfx_index;
++ struct kfd_mem_obj *mem_obj;
+ uint32_t *packet_buff_uint;
+- uint64_t packet_buff_gpu_addr;
+ struct pm4__set_config_reg *packets_vec;
+ size_t ib_size = sizeof(struct pm4__set_config_reg) * 3;
+
+ reg_sq_cmd.u32All = 0;
+
+ status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
+- &reg_gfx_index, dbgdev->dev->device_info->asic_family);
++ &reg_gfx_index);
+ if (status) {
+ pr_err("Failed to set wave control registers\n");
+ return status;
+@@ -652,13 +635,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+
+ pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *");
+
+- status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq,
+- ib_size / sizeof(uint32_t),
+- &packet_buff_uint, &packet_buff_gpu_addr);
+- if (status) {
+- pr_err("Failed to allocate IB from DIQ ring\n");
++ status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj);
++
++ if (status != 0) {
++ pr_err("Failed to allocate GART memory\n");
+ return status;
+ }
++
++ packet_buff_uint = mem_obj->cpu_ptr;
++
+ memset(packet_buff_uint, 0, ib_size);
+
+ packets_vec = (struct pm4__set_config_reg *) packet_buff_uint;
+@@ -666,7 +651,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ packets_vec[0].header.opcode = IT_SET_UCONFIG_REG;
+ packets_vec[0].header.type = PM4_TYPE_3;
+ packets_vec[0].bitfields2.reg_offset =
+- GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE;
++ GRBM_GFX_INDEX / (sizeof(uint32_t)) -
++ USERCONFIG_REG_BASE;
+
+ packets_vec[0].bitfields2.insert_vmid = 0;
+ packets_vec[0].reg_data[0] = reg_gfx_index.u32All;
+@@ -674,7 +660,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ packets_vec[1].header.count = 1;
+ packets_vec[1].header.opcode = IT_SET_CONFIG_REG;
+ packets_vec[1].header.type = PM4_TYPE_3;
+- packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE;
++ packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) -
++ AMD_CONFIG_REG_BASE;
+
+ packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET;
+ packets_vec[1].bitfields2.insert_vmid = 1;
+@@ -690,7 +677,8 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+
+ packets_vec[2].ordinal1 = packets_vec[0].ordinal1;
+ packets_vec[2].bitfields2.reg_offset =
+- GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE;
++ GRBM_GFX_INDEX / (sizeof(uint32_t)) -
++ USERCONFIG_REG_BASE;
+
+ packets_vec[2].bitfields2.insert_vmid = 0;
+ packets_vec[2].reg_data[0] = reg_gfx_index.u32All;
+@@ -698,13 +686,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev,
+ status = dbgdev_diq_submit_ib(
+ dbgdev,
+ wac_info->process->pasid,
+- packet_buff_gpu_addr,
++ mem_obj->gpu_addr,
+ packet_buff_uint,
+- ib_size, false);
++ ib_size);
+
+ if (status)
+ pr_err("Failed to submit IB to DIQ\n");
+
++ kfd_gtt_sa_free(dbgdev->dev, mem_obj);
++
+ return status;
+ }
+
+@@ -726,7 +716,7 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev,
+ return -EFAULT;
+ }
+ status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd,
+- &reg_gfx_index, dbgdev->dev->device_info->asic_family);
++ &reg_gfx_index);
+ if (status) {
+ pr_err("Failed to set wave control registers\n");
+ return status;
+@@ -779,8 +769,13 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+ union GRBM_GFX_INDEX_BITS reg_gfx_index;
+ struct kfd_process_device *pdd;
+ struct dbg_wave_control_info wac_info;
+- int first_vmid_to_scan = dev->vm_info.first_vmid_kfd;
+- int last_vmid_to_scan = dev->vm_info.last_vmid_kfd;
++ int temp;
++ int first_vmid_to_scan = 8;
++ int last_vmid_to_scan = 15;
++
++ first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
++ temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
++ last_vmid_to_scan = first_vmid_to_scan + ffz(temp);
+
+ reg_sq_cmd.u32All = 0;
+ status = 0;
+@@ -818,7 +813,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+ return -EFAULT;
+
+ status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd,
+- &reg_gfx_index, dev->device_info->asic_family);
++ &reg_gfx_index);
+ if (status != 0)
+ return -EINVAL;
+
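The hunks above size the release_mem header with sizeof(...) / sizeof(unsigned int) - 2, following the PM4 type-3 convention where the count field holds one less than the number of dwords after the header, i.e. total packet dwords minus two. A minimal sketch of that arithmetic, using an invented packet layout rather than the real pm4 structs:

#include <stdint.h>
#include <stdio.h>

/* Invented stand-in; the real release_mem packet has its own layout. */
struct demo_pm4_packet {
	uint32_t header;	/* 1 header dword */
	uint32_t body[6];	/* 6 payload dwords */
};

int main(void)
{
	/* PM4 type-3 convention: count = payload dwords - 1 = total dwords - 2. */
	unsigned int count =
		sizeof(struct demo_pm4_packet) / sizeof(uint32_t) - 2;

	printf("header.count = %u\n", count);	/* 7 total dwords -> 5 */
	return 0;
}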
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
+index 583aaa9..03424c2 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h
+@@ -60,24 +60,6 @@ enum {
+ SH_REG_SIZE = SH_REG_END - SH_REG_BASE
+ };
+
+-/* SQ_CMD definitions */
+-
+-enum {
+- SQ_IND_CMD_DATA_RESUME = 0,
+- SQ_IND_CMD_DATA_HALT = 1
+-};
+-
+-enum SQ_IND_CMD_NEW {
+- SQ_IND_CMD_NEW_NULL = 0x00000000,
+- SQ_IND_CMD_NEW_SETHALT = 0x00000001,
+- SQ_IND_CMD_NEW_SAVECTX = 0x00000002,
+- SQ_IND_CMD_NEW_KILL = 0x00000003,
+- SQ_IND_CMD_NEW_DEBUG = 0x00000004,
+- SQ_IND_CMD_NEW_TRAP = 0x00000005,
+- SQ_IND_CMD_NEW_SET_PRIO = 0x00000006
+-
+-};
+-
+ enum SQ_IND_CMD_CMD {
+ SQ_IND_CMD_CMD_NULL = 0x00000000,
+ SQ_IND_CMD_CMD_HALT = 0x00000001,
+@@ -136,20 +118,6 @@ union SQ_CMD_BITS {
+ uint32_t:1;
+ uint32_t vm_id:4;
+ } bitfields, bits;
+- struct {
+- uint32_t cmd:3;
+- uint32_t:1;
+- uint32_t mode:3;
+- uint32_t check_vmid:1;
+- uint32_t data:3;
+- uint32_t:5;
+- uint32_t wave_id:4;
+- uint32_t simd_id:2;
+- uint32_t:2;
+- uint32_t queue_id:3;
+- uint32_t:1;
+- uint32_t vm_id:4;
+- } bitfields_sethalt, bits_sethalt;
+ uint32_t u32All;
+ signed int i32All;
+ float f32All;
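The SQ_CMD_BITS union above (including the dropped bits_sethalt variant) pairs named bitfields with a raw u32All view so the packed value can be written to the register in a single store. The sketch below shows that pattern only; the field names and widths are illustrative, and bitfield layout is compiler/ABI dependent, which is exactly why the raw view is kept alongside.

#include <stdint.h>
#include <stdio.h>

union demo_reg_bits {
	struct {
		uint32_t cmd:3;		/* e.g. halt/resume/kill encodings */
		uint32_t :1;		/* reserved */
		uint32_t mode:3;
		uint32_t check_vmid:1;
		uint32_t :20;		/* reserved */
		uint32_t vm_id:4;
	} bitfields;
	uint32_t u32All;		/* raw value written to the register */
};

int main(void)
{
	union demo_reg_bits reg = { .u32All = 0 };

	reg.bitfields.cmd = 1;		/* assumed "halt" encoding, demo only */
	reg.bitfields.check_vmid = 1;
	reg.bitfields.vm_id = 8;

	printf("packed register value: 0x%08x\n", (unsigned int)reg.u32All);
	return 0;
}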
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+index 9d4af96..3da25f7 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+@@ -33,7 +33,6 @@
+ #include "kfd_pm4_headers_diq.h"
+ #include "kfd_dbgmgr.h"
+ #include "kfd_dbgdev.h"
+-#include "kfd_device_queue_manager.h"
+
+ static DEFINE_MUTEX(kfd_dbgmgr_mutex);
+
+@@ -84,7 +83,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
+ }
+
+ /* get actual type of DBGDevice cpsch or not */
+- if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
++ if (sched_policy == KFD_SCHED_POLICY_NO_HWS)
+ type = DBGDEV_TYPE_NODIQ;
+
+ kfd_dbgdev_init(new_buff->dbgdev, pdev, type);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+deleted file mode 100644
+index 232e28f..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
++++ /dev/null
+@@ -1,75 +0,0 @@
+-/*
+- * Copyright 2014 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include <linux/debugfs.h>
+-#include "kfd_priv.h"
+-
+-static struct dentry *debugfs_root;
+-
+-static int kfd_debugfs_open(struct inode *inode, struct file *file)
+-{
+- int (*show)(struct seq_file *, void *) = inode->i_private;
+-
+- return single_open(file, show, NULL);
+-}
+-
+-static const struct file_operations kfd_debugfs_fops = {
+- .owner = THIS_MODULE,
+- .open = kfd_debugfs_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
+-
+-void kfd_debugfs_init(void)
+-{
+- struct dentry *ent;
+-
+- debugfs_root = debugfs_create_dir("kfd", NULL);
+- if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) {
+- pr_warn("Failed to create kfd debugfs dir\n");
+- return;
+- }
+-
+- ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
+- kfd_debugfs_mqds_by_process,
+- &kfd_debugfs_fops);
+- if (!ent)
+- pr_warn("Failed to create mqds in kfd debugfs\n");
+-
+- ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
+- kfd_debugfs_hqds_by_device,
+- &kfd_debugfs_fops);
+- if (!ent)
+- pr_warn("Failed to create hqds in kfd debugfs\n");
+-
+- ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
+- kfd_debugfs_rls_by_device,
+- &kfd_debugfs_fops);
+- if (!ent)
+- pr_warn("Failed to create rls in kfd debugfs\n");
+-}
+-
+-void kfd_debugfs_fini(void)
+-{
+- debugfs_remove_recursive(debugfs_root);
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+index f701b4e..61fff25 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+@@ -20,206 +20,36 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ #include <linux/amd-iommu.h>
+-#endif
++#include <linux/bsearch.h>
+ #include <linux/pci.h>
+ #include <linux/slab.h>
+ #include "kfd_priv.h"
+ #include "kfd_device_queue_manager.h"
+ #include "kfd_pm4_headers_vi.h"
+-#include "cwsr_trap_handler_gfx8.asm"
+-#include "cwsr_trap_handler_gfx9.asm"
+
+ #define MQD_SIZE_ALIGNED 768
+-static atomic_t kfd_device_suspended = ATOMIC_INIT(0);
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ static const struct kfd_device_info kaveri_device_info = {
+ .asic_family = CHIP_KAVERI,
+ .max_pasid_bits = 16,
+ /* max num of queues for KV.TODO should be a dynamic value */
+ .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+ .ih_ring_entry_size = 4 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_cik,
+ .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = true,
+- .supports_cwsr = false,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
++ .mqd_size_aligned = MQD_SIZE_ALIGNED
+ };
+-#endif
+
+-static const struct kfd_device_info hawaii_device_info = {
+- .asic_family = CHIP_HAWAII,
+- .max_pasid_bits = 16,
+- /* max num of queues for KV.TODO should be a dynamic value */
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = false,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
+-};
+-
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ static const struct kfd_device_info carrizo_device_info = {
+ .asic_family = CHIP_CARRIZO,
+ .max_pasid_bits = 16,
+ /* max num of queues for CZ.TODO should be a dynamic value */
+ .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+ .ih_ring_entry_size = 4 * sizeof(uint32_t),
+ .event_interrupt_class = &event_interrupt_class_cik,
+ .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = true,
+- .supports_cwsr = true,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
+-};
+-#endif
+-
+-static const struct kfd_device_info tonga_device_info = {
+- .asic_family = CHIP_TONGA,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = false,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info fiji_device_info = {
+- .asic_family = CHIP_FIJI,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info fiji_vf_device_info = {
+- .asic_family = CHIP_FIJI,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
+-};
+-
+-
+-static const struct kfd_device_info polaris10_device_info = {
+- .asic_family = CHIP_POLARIS10,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info polaris10_vf_device_info = {
+- .asic_family = CHIP_POLARIS10,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info polaris11_device_info = {
+- .asic_family = CHIP_POLARIS11,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 4,
+- .ih_ring_entry_size = 4 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_cik,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info vega10_device_info = {
+- .asic_family = CHIP_VEGA10,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 8,
+- .ih_ring_entry_size = 8 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_v9,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info vega10_vf_device_info = {
+- .asic_family = CHIP_VEGA10,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 8,
+- .ih_ring_entry_size = 8 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_v9,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = false,
+- .supports_cwsr = true,
+- .needs_pci_atomics = false,
+- .num_sdma_engines = 2,
+-};
+-
+-static const struct kfd_device_info raven_device_info = {
+- .asic_family = CHIP_RAVEN,
+- .max_pasid_bits = 16,
+- .max_no_of_hqd = 24,
+- .doorbell_size = 8,
+- .ih_ring_entry_size = 8 * sizeof(uint32_t),
+- .event_interrupt_class = &event_interrupt_class_v9,
+- .num_of_watch_points = 4,
+- .mqd_size_aligned = MQD_SIZE_ALIGNED,
+- .is_need_iommu_device = true,
+- .supports_cwsr = true,
+- .needs_pci_atomics = true,
+- .num_sdma_engines = 1,
++ .mqd_size_aligned = MQD_SIZE_ALIGNED
+ };
+
+ struct kfd_deviceid {
+@@ -229,7 +59,6 @@ struct kfd_deviceid {
+
+ /* Please keep this sorted by increasing device id. */
+ static const struct kfd_deviceid supported_devices[] = {
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ { 0x1304, &kaveri_device_info }, /* Kaveri */
+ { 0x1305, &kaveri_device_info }, /* Kaveri */
+ { 0x1306, &kaveri_device_info }, /* Kaveri */
+@@ -252,76 +81,17 @@ static const struct kfd_deviceid supported_devices[] = {
+ { 0x131B, &kaveri_device_info }, /* Kaveri */
+ { 0x131C, &kaveri_device_info }, /* Kaveri */
+ { 0x131D, &kaveri_device_info }, /* Kaveri */
+-#endif
+- { 0x67A0, &hawaii_device_info }, /* Hawaii */
+- { 0x67A1, &hawaii_device_info }, /* Hawaii */
+- { 0x67A2, &hawaii_device_info }, /* Hawaii */
+- { 0x67A8, &hawaii_device_info }, /* Hawaii */
+- { 0x67A9, &hawaii_device_info }, /* Hawaii */
+- { 0x67AA, &hawaii_device_info }, /* Hawaii */
+- { 0x67B0, &hawaii_device_info }, /* Hawaii */
+- { 0x67B1, &hawaii_device_info }, /* Hawaii */
+- { 0x67B8, &hawaii_device_info }, /* Hawaii */
+- { 0x67B9, &hawaii_device_info }, /* Hawaii */
+- { 0x67BA, &hawaii_device_info }, /* Hawaii */
+- { 0x67BE, &hawaii_device_info }, /* Hawaii */
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ { 0x9870, &carrizo_device_info }, /* Carrizo */
+ { 0x9874, &carrizo_device_info }, /* Carrizo */
+ { 0x9875, &carrizo_device_info }, /* Carrizo */
+ { 0x9876, &carrizo_device_info }, /* Carrizo */
+- { 0x9877, &carrizo_device_info }, /* Carrizo */
+-#endif
+- { 0x6920, &tonga_device_info }, /* Tonga */
+- { 0x6921, &tonga_device_info }, /* Tonga */
+- { 0x6928, &tonga_device_info }, /* Tonga */
+- { 0x6929, &tonga_device_info }, /* Tonga */
+- { 0x692B, &tonga_device_info }, /* Tonga */
+- { 0x6938, &tonga_device_info }, /* Tonga */
+- { 0x6939, &tonga_device_info }, /* Tonga */
+- { 0x7300, &fiji_device_info }, /* Fiji */
+- { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/
+- { 0x67C0, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C1, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C2, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C4, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C7, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C8, &polaris10_device_info }, /* Polaris10 */
+- { 0x67C9, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CA, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CC, &polaris10_device_info }, /* Polaris10 */
+- { 0x67CF, &polaris10_device_info }, /* Polaris10 */
+- { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/
+- { 0x67DF, &polaris10_device_info }, /* Polaris10 */
+- { 0x67E0, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E1, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E3, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E7, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E8, &polaris11_device_info }, /* Polaris11 */
+- { 0x67E9, &polaris11_device_info }, /* Polaris11 */
+- { 0x67EB, &polaris11_device_info }, /* Polaris11 */
+- { 0x67EF, &polaris11_device_info }, /* Polaris11 */
+- { 0x67FF, &polaris11_device_info }, /* Polaris11 */
+- { 0x6860, &vega10_device_info }, /* Vega10 */
+- { 0x6861, &vega10_device_info }, /* Vega10 */
+- { 0x6862, &vega10_device_info }, /* Vega10 */
+- { 0x6863, &vega10_device_info }, /* Vega10 */
+- { 0x6864, &vega10_device_info }, /* Vega10 */
+- { 0x6867, &vega10_device_info }, /* Vega10 */
+- { 0x6868, &vega10_device_info }, /* Vega10 */
+- { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/
+- { 0x687F, &vega10_device_info }, /* Vega10 */
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- { 0x15DD, &raven_device_info } /* Raven */
+-#endif
++ { 0x9877, &carrizo_device_info } /* Carrizo */
+ };
+
+ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
+ unsigned int chunk_size);
+ static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
+
+-static int kfd_resume(struct kfd_dev *kfd);
+-
+ static const struct kfd_device_info *lookup_device_info(unsigned short did)
+ {
+ size_t i;
+@@ -352,17 +122,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
+ return NULL;
+ }
+
+- if (device_info->needs_pci_atomics) {
+- /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps.
+- */
+- if (pci_enable_atomic_ops_to_root(pdev) < 0) {
+- dev_info(kfd_device,
+- "skipped device %x:%x, PCI rejects atomics",
+- pdev->vendor, pdev->device);
+- return NULL;
+- }
+- }
+-
+ kfd = kzalloc(sizeof(*kfd), GFP_KERNEL);
+ if (!kfd)
+ return NULL;
+@@ -380,7 +139,6 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
+ return kfd;
+ }
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
+ {
+ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+@@ -410,9 +168,23 @@ static bool device_iommu_pasid_init(struct kfd_dev *kfd)
+ pasid_limit = min_t(unsigned int,
+ (unsigned int)(1 << kfd->device_info->max_pasid_bits),
+ iommu_info.max_pasids);
++ /*
++ * last pasid is used for kernel queues doorbells
++ * in the future the last pasid might be used for a kernel thread.
++ */
++ pasid_limit = min_t(unsigned int,
++ pasid_limit,
++ kfd->doorbell_process_limit - 1);
++
++ err = amd_iommu_init_device(kfd->pdev, pasid_limit);
++ if (err < 0) {
++ dev_err(kfd_device, "error initializing iommu device\n");
++ return false;
++ }
+
+ if (!kfd_set_pasid_limit(pasid_limit)) {
+ dev_err(kfd_device, "error setting pasid limit\n");
++ amd_iommu_free_device(kfd->pdev);
+ return false;
+ }
+
+@@ -424,7 +196,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
+ struct kfd_dev *dev = kfd_device_by_pci_dev(pdev);
+
+ if (dev)
+- kfd_process_iommu_unbind_callback(dev, pasid);
++ kfd_unbind_process_from_device(dev, pasid);
+ }
+
+ /*
+@@ -451,69 +223,14 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
+
+ return AMD_IOMMU_INV_PRI_RSP_INVALID;
+ }
+-#endif /* CONFIG_AMD_IOMMU_V2 */
+-
+-static int kfd_cwsr_init(struct kfd_dev *kfd)
+-{
+- if (cwsr_enable && kfd->device_info->supports_cwsr) {
+- if (kfd->device_info->asic_family < CHIP_VEGA10) {
+- BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
+- kfd->cwsr_isa = cwsr_trap_gfx8_hex;
+- kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
+- } else {
+- BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
+- kfd->cwsr_isa = cwsr_trap_gfx9_hex;
+- kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
+- }
+-
+- kfd->cwsr_enabled = true;
+- }
+-
+- return 0;
+-}
+-
+-static void kfd_ib_mem_init(struct kfd_dev *kdev)
+-{
+- /* In certain cases we need to send IB from kernel using the GPU address
+- * space created by user applications.
+- * For example, on GFX v7, we need to flush TC associated to the VMID
+- * before tearing down the VMID. In order to do so, we need an address
+- * valid to the VMID to place the IB while this space was created on
+- * the user's side, not the kernel.
+- * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size"
+- * but CWSR only uses pages above cwsr_base, we'll use one page memory
+- * under cwsr_base for IB submissions
+- */
+- kdev->ib_size = PAGE_SIZE;
+-}
+
+ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ const struct kgd2kfd_shared_resources *gpu_resources)
+ {
+ unsigned int size;
+
+- kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+- KGD_ENGINE_MEC1);
+-
+ kfd->shared_resources = *gpu_resources;
+
+- /* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */
+- kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1;
+- kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1;
+- kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd
+- - kfd->vm_info.first_vmid_kfd + 1;
+-
+- /* Verify module parameters regarding mapped process number*/
+- if ((hws_max_conc_proc < 0)
+- || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) {
+- dev_err(kfd_device,
+- "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n",
+- hws_max_conc_proc, kfd->vm_info.vmid_num_kfd,
+- kfd->vm_info.vmid_num_kfd);
+- kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd;
+- } else
+- kfd->max_proc_per_quantum = hws_max_conc_proc;
+-
+ /* calculate max size of mqds needed for queues */
+ size = max_num_of_queues_per_device *
+ kfd->device_info->mqd_size_aligned;
+@@ -563,31 +280,29 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ goto kfd_interrupt_error;
+ }
+
++ if (!device_iommu_pasid_init(kfd)) {
++ dev_err(kfd_device,
++ "Error initializing iommuv2 for device %x:%x\n",
++ kfd->pdev->vendor, kfd->pdev->device);
++ goto device_iommu_pasid_error;
++ }
++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
++ iommu_pasid_shutdown_callback);
++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
++
+ kfd->dqm = device_queue_manager_init(kfd);
+ if (!kfd->dqm) {
+ dev_err(kfd_device, "Error initializing queue manager\n");
+ goto device_queue_manager_error;
+ }
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (kfd->device_info->is_need_iommu_device) {
+- if (!device_iommu_pasid_init(kfd)) {
+- dev_err(kfd_device, "Error initializing iommuv2\n");
+- goto device_iommu_pasid_error;
+- }
+- }
+-#endif
+-
+- if (kfd_cwsr_init(kfd)) {
+- dev_err(kfd_device, "Error initializing cwsr\n");
+- goto device_iommu_pasid_error;
++ if (kfd->dqm->ops.start(kfd->dqm)) {
++ dev_err(kfd_device,
++ "Error starting queue manager for device %x:%x\n",
++ kfd->pdev->vendor, kfd->pdev->device);
++ goto dqm_start_error;
+ }
+
+- kfd_ib_mem_init(kfd);
+-
+- if (kfd_resume(kfd))
+- goto kfd_resume_error;
+-
+ kfd->dbgmgr = NULL;
+
+ kfd->init_complete = true;
+@@ -595,14 +310,15 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ kfd->pdev->device);
+
+ pr_debug("Starting kfd with the following scheduling policy %d\n",
+- kfd->dqm->sched_policy);
++ sched_policy);
+
+ goto out;
+
+-kfd_resume_error:
+-device_iommu_pasid_error:
++dqm_start_error:
+ device_queue_manager_uninit(kfd->dqm);
+ device_queue_manager_error:
++ amd_iommu_free_device(kfd->pdev);
++device_iommu_pasid_error:
+ kfd_interrupt_exit(kfd);
+ kfd_interrupt_error:
+ kfd_topology_remove_device(kfd);
+@@ -622,8 +338,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ void kgd2kfd_device_exit(struct kfd_dev *kfd)
+ {
+ if (kfd->init_complete) {
+- kgd2kfd_suspend(kfd);
+ device_queue_manager_uninit(kfd->dqm);
++ amd_iommu_free_device(kfd->pdev);
+ kfd_interrupt_exit(kfd);
+ kfd_topology_remove_device(kfd);
+ kfd_doorbell_fini(kfd);
+@@ -634,385 +350,55 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
+ kfree(kfd);
+ }
+
+-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+-{
+- return 0;
+-}
+-
+-int kgd2kfd_post_reset(struct kfd_dev *kfd)
+-{
+- return 0;
+-}
+-
+ void kgd2kfd_suspend(struct kfd_dev *kfd)
+ {
+- if (!kfd->init_complete)
+- return;
+-
+- /* For first KFD device suspend all the KFD processes */
+- if (atomic_inc_return(&kfd_device_suspended) == 1)
+- kfd_suspend_all_processes();
+-
+- kfd->dqm->ops.stop(kfd->dqm);
+-
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (!kfd->device_info->is_need_iommu_device)
+- return;
+-
+- kfd_unbind_processes_from_device(kfd);
+-
+- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
+- amd_iommu_free_device(kfd->pdev);
+-#endif
++ if (kfd->init_complete) {
++ kfd->dqm->ops.stop(kfd->dqm);
++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
++ amd_iommu_free_device(kfd->pdev);
++ }
+ }
+
+ int kgd2kfd_resume(struct kfd_dev *kfd)
+ {
+- int ret;
+-
+- if (!kfd->init_complete)
+- return 0;
+-
+- ret = kfd_resume(kfd);
+- if (ret)
+- return ret;
+-
+- if (atomic_dec_return(&kfd_device_suspended) == 0)
+- ret = kfd_resume_all_processes();
+- WARN(atomic_read(&kfd_device_suspended) < 0,
+- "KFD suspend / resume ref. error\n");
+- return ret;
+-}
+-
+-static int kfd_resume(struct kfd_dev *kfd)
+-{
+- int err = 0;
++ unsigned int pasid_limit;
++ int err;
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (kfd->device_info->is_need_iommu_device) {
+- unsigned int pasid_limit = kfd_get_pasid_limit();
++ pasid_limit = kfd_get_pasid_limit();
+
++ if (kfd->init_complete) {
+ err = amd_iommu_init_device(kfd->pdev, pasid_limit);
+- if (err) {
++ if (err < 0) {
+ dev_err(kfd_device, "failed to initialize iommu\n");
+ return -ENXIO;
+ }
+
+ amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
+- iommu_pasid_shutdown_callback);
+- amd_iommu_set_invalid_ppr_cb(kfd->pdev,
+- iommu_invalid_ppr_cb);
+-
+- err = kfd_bind_processes_to_device(kfd);
+- if (err) {
+- dev_err(kfd_device,
+- "failed to bind process to device\n");
+- return -ENXIO;
+- }
++ iommu_pasid_shutdown_callback);
++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
++ kfd->dqm->ops.start(kfd->dqm);
+ }
+-#endif
+
+- err = kfd->dqm->ops.start(kfd->dqm);
+- if (err) {
+- dev_err(kfd_device,
+- "Error starting queue manager for device %x:%x\n",
+- kfd->pdev->vendor, kfd->pdev->device);
+- goto dqm_start_error;
+- }
+-
+- return err;
+-
+-dqm_start_error:
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (kfd->device_info->is_need_iommu_device)
+- amd_iommu_free_device(kfd->pdev);
+-#endif
+-
+- return err;
++ return 0;
+ }
+
+ /* This is called directly from KGD at ISR. */
+ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
+ {
+- uint32_t patched_ihre[DIV_ROUND_UP(
+- kfd->device_info->ih_ring_entry_size,
+- sizeof(uint32_t))];
+- bool is_patched = false;
+-
+ if (!kfd->init_complete)
+ return;
+
+ spin_lock(&kfd->interrupt_lock);
+
+- if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry,
+- patched_ihre, &is_patched)
+- && enqueue_ih_ring_entry(kfd,
+- is_patched ? patched_ihre : ih_ring_entry))
+- queue_work(kfd->ih_wq, &kfd->interrupt_work);
++ if (kfd->interrupts_active
++ && interrupt_is_wanted(kfd, ih_ring_entry)
++ && enqueue_ih_ring_entry(kfd, ih_ring_entry))
++ schedule_work(&kfd->interrupt_work);
+
+ spin_unlock(&kfd->interrupt_lock);
+ }
+
+-/* quiesce_process_mm -
+- * Quiesce all user queues that belongs to given process p
+- */
+-int quiesce_process_mm(struct kfd_process *p)
+-{
+- struct kfd_process_device *pdd;
+- int r = 0;
+- unsigned int n_evicted = 0;
+-
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- r = process_evict_queues(pdd->dev->dqm, &pdd->qpd);
+- if (r != 0) {
+- pr_err("Failed to evict process queues\n");
+- goto fail;
+- }
+- n_evicted++;
+- }
+-
+- return r;
+-
+-fail:
+- /* To keep state consistent, roll back partial eviction by
+- * restoring queues
+- */
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- if (n_evicted == 0)
+- break;
+- if (process_restore_queues(pdd->dev->dqm, &pdd->qpd))
+- pr_err("Failed to restore queues\n");
+-
+- n_evicted--;
+- }
+-
+- return r;
+-}
+-
+-/* resume_process_mm -
+- * Resume all user queues that belongs to given process p. The caller must
+- * ensure that process p context is valid.
+- */
+-static int resume_process_mm(struct kfd_process *p)
+-{
+- struct kfd_process_device *pdd;
+- struct mm_struct *mm = (struct mm_struct *)p->mm;
+- int r, ret = 0;
+-
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+- down_read(&mm->mmap_sem);
+-
+- r = process_restore_queues(pdd->dev->dqm, &pdd->qpd);
+- if (r != 0) {
+- pr_err("Failed to restore process queues\n");
+- if (ret == 0)
+- ret = r;
+- }
+-
+- if (pdd->dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS)
+- up_read(&mm->mmap_sem);
+- }
+-
+- return ret;
+-}
+-
+-int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+-{
+- struct kfd_process *p;
+- struct kfd_process_device *pdd;
+- int r;
+-
+- /* Because we are called from arbitrary context (workqueue) as opposed
+- * to process context, kfd_process could attempt to exit while we are
+- * running so the lookup function increments the process ref count.
+- */
+- p = kfd_lookup_process_by_mm(mm);
+- if (!p)
+- return -ENODEV;
+-
+- if (kfd) {
+- r = -ENODEV;
+- pdd = kfd_get_process_device_data(kfd, p);
+- if (pdd)
+- r = process_evict_queues(kfd->dqm, &pdd->qpd);
+- } else {
+- r = quiesce_process_mm(p);
+- }
+-
+- kfd_unref_process(p);
+- return r;
+-}
+-
+-int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm)
+-{
+- struct kfd_process *p;
+- struct kfd_process_device *pdd;
+- int r;
+-
+- /* Because we are called from arbitrary context (workqueue) as opposed
+- * to process context, kfd_process could attempt to exit while we are
+- * running so the lookup function increments the process ref count.
+- */
+- p = kfd_lookup_process_by_mm(mm);
+- if (!p)
+- return -ENODEV;
+-
+- if (kfd) {
+- r = -ENODEV;
+- pdd = kfd_get_process_device_data(kfd, p);
+- if (pdd)
+- r = process_restore_queues(kfd->dqm, &pdd->qpd);
+- } else {
+- r = resume_process_mm(p);
+- }
+-
+- kfd_unref_process(p);
+- return r;
+-}
+-
+-
+-void kfd_restore_bo_worker(struct work_struct *work)
+-{
+- struct delayed_work *dwork;
+- struct kfd_process *p;
+- struct kfd_process_device *pdd;
+- int ret = 0;
+-
+- dwork = to_delayed_work(work);
+-
+- /* Process termination destroys this worker thread. So during the
+- * lifetime of this thread, kfd_process p will be valid
+- */
+- p = container_of(dwork, struct kfd_process, restore_work);
+-
+- /* Call restore_process_bos on the first KGD device. This function
+- * takes care of restoring the whole process including other devices.
+- * Restore can fail if enough memory is not available. If so,
+- * reschedule again.
+- */
+- pdd = list_first_entry(&p->per_device_data,
+- struct kfd_process_device,
+- per_device_list);
+-
+- pr_info("Started restoring process of pasid %d\n", p->pasid);
+-
+- /* Setting last_restore_timestamp before successful restoration.
+- * Otherwise this would have to be set by KGD (restore_process_bos)
+- * before KFD BOs are unreserved. If not, the process can be evicted
+- * again before the timestamp is set.
+- * If restore fails, the timestamp will be set again in the next
+- * attempt. This would mean that the minimum GPU quanta would be
+- * PROCESS_ACTIVE_TIME_MS - (time to execute the following two
+- * functions)
+- */
+-
+- p->last_restore_timestamp = get_jiffies_64();
+- ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef);
+- if (ret) {
+- pr_info("Restore failed, try again after %d ms\n",
+- PROCESS_BACK_OFF_TIME_MS);
+- ret = schedule_delayed_work(&p->restore_work,
+- msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
+- WARN(!ret, "reschedule restore work failed\n");
+- return;
+- }
+-
+- ret = resume_process_mm(p);
+- if (ret)
+- pr_err("Failed to resume user queues\n");
+-
+- pr_info("Finished restoring process of pasid %d\n", p->pasid);
+-}
+-
+-/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will
+- * prepare for safe eviction of KFD BOs that belong to the specified
+- * process.
+- *
+- * @mm: mm_struct that identifies the specified KFD process
+- * @fence: eviction fence attached to KFD process BOs
+- *
+- */
+-int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
+- struct dma_fence *fence)
+-{
+- struct kfd_process *p;
+- unsigned long active_time;
+- unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS);
+-
+- if (!fence)
+- return -EINVAL;
+-
+- if (dma_fence_is_signaled(fence))
+- return 0;
+-
+- p = kfd_lookup_process_by_mm(mm);
+- if (!p)
+- return -ENODEV;
+-
+- if (fence->seqno == p->last_eviction_seqno)
+- goto out;
+-
+- p->last_eviction_seqno = fence->seqno;
+-
+- /* Avoid KFD process starvation. Wait for at least
+- * PROCESS_ACTIVE_TIME_MS before evicting the process again
+- */
+- active_time = get_jiffies_64() - p->last_restore_timestamp;
+- if (delay_jiffies > active_time)
+- delay_jiffies -= active_time;
+- else
+- delay_jiffies = 0;
+-
+- /* During process initialization eviction_work.dwork is initialized
+- * to kfd_evict_bo_worker
+- */
+- schedule_delayed_work(&p->eviction_work, delay_jiffies);
+-out:
+- kfd_unref_process(p);
+- return 0;
+-}
+-
+-void kfd_evict_bo_worker(struct work_struct *work)
+-{
+- int ret;
+- struct kfd_process *p;
+- struct delayed_work *dwork;
+-
+- dwork = to_delayed_work(work);
+-
+- /* Process termination destroys this worker thread. So during the
+- * lifetime of this thread, kfd_process p will be valid
+- */
+- p = container_of(dwork, struct kfd_process, eviction_work);
+- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
+- "Eviction fence mismatch\n");
+-
+- /* Narrow window of overlap between restore and evict work
+- * item is possible. Once
+- * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs,
+- * it is possible to evicted again. But restore has few more
+- * steps of finish. So lets wait for any previous restore work
+- * to complete
+- */
+- flush_delayed_work(&p->restore_work);
+-
+- pr_info("Started evicting process of pasid %d\n", p->pasid);
+- ret = quiesce_process_mm(p);
+- if (!ret) {
+- dma_fence_signal(p->ef);
+- dma_fence_put(p->ef);
+- p->ef = NULL;
+- schedule_delayed_work(&p->restore_work,
+- msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
+-
+- pr_info("Finished evicting process of pasid %d\n", p->pasid);
+- } else
+- pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n",
+- p->pasid);
+-}
+-
+ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
+ unsigned int chunk_size)
+ {
+@@ -1076,8 +462,8 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
+ if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
+ return -ENOMEM;
+
+- *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+- if (!(*mem_obj))
++ *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
++ if ((*mem_obj) == NULL)
+ return -ENOMEM;
+
+ pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+index a628a0d..53a66e8 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+@@ -44,14 +44,9 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+ struct queue *q,
+ struct qcm_process_device *qpd);
+
+-static int execute_queues_cpsch(struct device_queue_manager *dqm,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param);
+-static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param);
+-
+-static int map_queues_cpsch(struct device_queue_manager *dqm);
++static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock);
++static int destroy_queues_cpsch(struct device_queue_manager *dqm,
++ bool preempt_static_queues, bool lock);
+
+ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+ struct queue *q,
+@@ -98,17 +93,6 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm)
+ return dqm->dev->shared_resources.num_pipe_per_mec;
+ }
+
+-static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
+-{
+- return dqm->dev->device_info->num_sdma_engines;
+-}
+-
+-unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
+-{
+- return dqm->dev->device_info->num_sdma_engines
+- * KFD_SDMA_QUEUES_PER_ENGINE;
+-}
+-
+ void program_sh_mem_settings(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+@@ -120,57 +104,6 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
+ qpd->sh_mem_bases);
+ }
+
+-static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
+-{
+- struct kfd_dev *dev = qpd->dqm->dev;
+-
+- if (!KFD_IS_SOC15(dev->device_info->asic_family)) {
+- /* On pre-SOC15 chips we need to use the queue ID to
+- * preserve the user mode ABI.
+- */
+- q->doorbell_id = q->properties.queue_id;
+- } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+- /* For SDMA queues on SOC15, use static doorbell
+- * assignments based on the engine and queue.
+- */
+- q->doorbell_id = dev->shared_resources.sdma_doorbell
+- [q->properties.sdma_engine_id]
+- [q->properties.sdma_queue_id];
+- } else {
+- /* For CP queues on SOC15 reserve a free doorbell ID */
+- unsigned int found;
+-
+- found = find_first_zero_bit(qpd->doorbell_bitmap,
+- KFD_MAX_NUM_OF_QUEUES_PER_PROCESS);
+- if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
+- pr_debug("No doorbells available");
+- return -EBUSY;
+- }
+- set_bit(found, qpd->doorbell_bitmap);
+- q->doorbell_id = found;
+- }
+-
+- q->properties.doorbell_off =
+- kfd_doorbell_id_to_offset(dev, q->process,
+- q->doorbell_id);
+-
+- return 0;
+-}
+-
+-static void deallocate_doorbell(struct qcm_process_device *qpd,
+- struct queue *q)
+-{
+- unsigned int old;
+- struct kfd_dev *dev = qpd->dqm->dev;
+-
+- if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
+- q->properties.type == KFD_QUEUE_TYPE_SDMA)
+- return;
+-
+- old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
+- WARN_ON(!old);
+-}
+-
+ static int allocate_vmid(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ struct queue *q)
+@@ -180,10 +113,11 @@ static int allocate_vmid(struct device_queue_manager *dqm,
+ if (dqm->vmid_bitmap == 0)
+ return -ENOMEM;
+
+- bit = ffs(dqm->vmid_bitmap) - 1;
+- dqm->vmid_bitmap &= ~(1 << bit);
++ bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, CIK_VMID_NUM);
++ clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
+
+- allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
++ /* Kaveri kfd vmid's starts from vmid 8 */
++ allocated_vmid = bit + KFD_VMID_START_OFFSET;
+ pr_debug("vmid allocation %d\n", allocated_vmid);
+ qpd->vmid = allocated_vmid;
+ q->properties.vmid = allocated_vmid;
+@@ -191,57 +125,27 @@ static int allocate_vmid(struct device_queue_manager *dqm,
+ set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid);
+ program_sh_mem_settings(dqm, qpd);
+
+- /* qpd->page_table_base is set earlier when register_process()
+- * is called, i.e. when the first queue is created.
+- */
+- dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd,
+- qpd->vmid,
+- qpd->page_table_base);
+- /*invalidate the VM context after pasid and vmid mapping is set up*/
+- kfd_flush_tlb(dqm->dev, qpd->pqm->process);
+-
+ return 0;
+ }
+
+-static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
+- struct qcm_process_device *qpd)
+-{
+- uint32_t len;
+-
+- if (!qpd->ib_kaddr)
+- return -ENOMEM;
+-
+- len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base,
+- (uint32_t *)qpd->ib_kaddr);
+-
+- return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
+- qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+-}
+-
+ static void deallocate_vmid(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ struct queue *q)
+ {
+- int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
+-
+- /* On GFX v7, CP doesn't flush TC at dequeue */
+- if (q->device->device_info->asic_family == CHIP_HAWAII)
+- if (flush_texture_cache_nocpsch(q->device, qpd))
+- pr_err("Failed to flush TC\n");
+-
+- kfd_flush_tlb(dqm->dev, qpd->pqm->process);
++ int bit = qpd->vmid - KFD_VMID_START_OFFSET;
+
+ /* Release the vmid mapping */
+ set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+
+- dqm->vmid_bitmap |= (1 << bit);
++ set_bit(bit, (unsigned long *)&dqm->vmid_bitmap);
+ qpd->vmid = 0;
+ q->properties.vmid = 0;
+ }
+
+ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ struct queue *q,
+- struct qcm_process_device *qpd)
++ struct qcm_process_device *qpd,
++ int *allocated_vmid)
+ {
+ int retval;
+
+@@ -261,18 +165,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ if (retval)
+ goto out_unlock;
+ }
++ *allocated_vmid = qpd->vmid;
+ q->properties.vmid = qpd->vmid;
+- /*
+- * Eviction state logic: we only mark active queues as evicted
+- * to avoid the overhead of restoring inactive queues later
+- */
+- if (qpd->evicted)
+- q->properties.is_evicted = (q->properties.queue_size > 0 &&
+- q->properties.queue_percent > 0 &&
+- q->properties.queue_address != 0);
+-
+- q->properties.tba_addr = qpd->tba_addr;
+- q->properties.tma_addr = qpd->tma_addr;
+
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+ retval = create_compute_queue_nocpsch(dqm, q, qpd);
+@@ -282,14 +176,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ retval = -EINVAL;
+
+ if (retval) {
+- if (list_empty(&qpd->queues_list))
++ if (list_empty(&qpd->queues_list)) {
+ deallocate_vmid(dqm, qpd, q);
+-
++ *allocated_vmid = 0;
++ }
+ goto out_unlock;
+ }
+
+ list_add(&q->list, &qpd->queues_list);
+- qpd->queue_count++;
+ if (q->properties.is_active)
+ dqm->queue_count++;
+
+@@ -324,8 +218,12 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q)
+ continue;
+
+ if (dqm->allocated_queues[pipe] != 0) {
+- bit = ffs(dqm->allocated_queues[pipe]) - 1;
+- dqm->allocated_queues[pipe] &= ~(1 << bit);
++ bit = find_first_bit(
++ (unsigned long *)&dqm->allocated_queues[pipe],
++ get_queues_per_pipe(dqm));
++
++ clear_bit(bit,
++ (unsigned long *)&dqm->allocated_queues[pipe]);
+ q->pipe = pipe;
+ q->queue = bit;
+ set = true;
+@@ -346,7 +244,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q)
+ static inline void deallocate_hqd(struct device_queue_manager *dqm,
+ struct queue *q)
+ {
+- dqm->allocated_queues[q->pipe] |= (1 << q->queue);
++ set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]);
+ }
+
+ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+@@ -364,24 +262,17 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+ if (retval)
+ return retval;
+
+- retval = allocate_doorbell(qpd, q);
+- if (retval)
+- goto out_deallocate_hqd;
+-
+ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+- goto out_deallocate_doorbell;
++ goto out_deallocate_hqd;
+
+ pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+ q->pipe, q->queue);
+
+- dqm->dev->kfd2kgd->alloc_memory_of_scratch(
++ dqm->dev->kfd2kgd->set_scratch_backing_va(
+ dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
+
+- if (!q->properties.is_active)
+- return 0;
+-
+ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties,
+ q->process->mm);
+ if (retval)
+@@ -391,84 +282,71 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
+
+ out_uninit_mqd:
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+-out_deallocate_doorbell:
+- deallocate_doorbell(qpd, q);
+ out_deallocate_hqd:
+ deallocate_hqd(dqm, q);
+
+ return retval;
+ }
+
+-/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
+- * to avoid asynchronized access
+- */
+-static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
++static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ struct queue *q)
+ {
+ int retval;
+ struct mqd_manager *mqd;
+
+- mqd = dqm->ops.get_mqd_manager(dqm,
+- get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd)
+- return -ENOMEM;
++ retval = 0;
++
++ mutex_lock(&dqm->lock);
+
+ if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
++ if (mqd == NULL) {
++ retval = -ENOMEM;
++ goto out;
++ }
+ deallocate_hqd(dqm, q);
+ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
++ if (mqd == NULL) {
++ retval = -ENOMEM;
++ goto out;
++ }
+ dqm->sdma_queue_count--;
+ deallocate_sdma_queue(dqm, q->sdma_id);
+ } else {
+ pr_debug("q->properties.type %d is invalid\n",
+ q->properties.type);
+- return -EINVAL;
++ retval = -EINVAL;
++ goto out;
+ }
+- dqm->total_queue_count--;
+-
+- deallocate_doorbell(qpd, q);
+
+ retval = mqd->destroy_mqd(mqd, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+- KFD_UNMAP_LATENCY_MS,
++ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+ q->pipe, q->queue);
+- if (retval == -ETIME)
+- qpd->reset_wavefronts = true;
++
++ if (retval)
++ goto out;
+
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+
+ list_del(&q->list);
+- if (list_empty(&qpd->queues_list)) {
+- if (qpd->reset_wavefronts) {
+- pr_warn("Resetting wave fronts (nocpsch) on dev %p\n",
+- dqm->dev);
+- /* dbgdev_wave_reset_wavefronts has to be called before
+- * deallocate_vmid(), i.e. when vmid is still in use.
+- */
+- dbgdev_wave_reset_wavefronts(dqm->dev,
+- qpd->pqm->process);
+- qpd->reset_wavefronts = false;
+- }
+-
++ if (list_empty(&qpd->queues_list))
+ deallocate_vmid(dqm, qpd, q);
+- }
+- qpd->queue_count--;
+ if (q->properties.is_active)
+ dqm->queue_count--;
+
+- return retval;
+-}
+-
+-static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd,
+- struct queue *q)
+-{
+- int retval;
++ /*
++ * Unconditionally decrement this counter, regardless of the queue's
++ * type
++ */
++ dqm->total_queue_count--;
++ pr_debug("Total of %d queues are accountable so far\n",
++ dqm->total_queue_count);
+
+- mutex_lock(&dqm->lock);
+- retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
++out:
+ mutex_unlock(&dqm->lock);
+-
+ return retval;
+ }
+
+@@ -476,82 +354,39 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ {
+ int retval;
+ struct mqd_manager *mqd;
+- struct kfd_process_device *pdd;
+-
+ bool prev_active = false;
+
+ mutex_lock(&dqm->lock);
+-
+- pdd = kfd_get_process_device_data(q->device, q->process);
+- if (!pdd) {
+- retval = -ENODEV;
+- goto out_unlock;
+- }
+ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+ if (!mqd) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+- /*
+- * Eviction state logic: we only mark active queues as evicted
+- * to avoid the overhead of restoring inactive queues later
+- */
+- if (pdd->qpd.evicted > 0)
+- q->properties.is_evicted = (q->properties.queue_size > 0 &&
+- q->properties.queue_percent > 0 &&
+- q->properties.queue_address != 0);
+-
+- /* Save previous activity state for counters */
+- prev_active = q->properties.is_active;
+-
+- /* Make sure the queue is unmapped before updating the MQD */
+- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
+- retval = unmap_queues_cpsch(dqm,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+- if (retval) {
+- pr_err("unmap queue failed\n");
+- goto out_unlock;
+- }
+- } else if (prev_active &&
+- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+- q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+- retval = mqd->destroy_mqd(mqd, q->mqd,
+- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+- if (retval) {
+- pr_err("destroy mqd failed\n");
+- goto out_unlock;
+- }
+- }
+
+- retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
++ if (q->properties.is_active)
++ prev_active = true;
+
+ /*
+- * check active state vs. the previous state and modify
+- * counter accordingly. map_queues_cpsch uses the
+- * dqm->queue_count to determine whether a new runlist must be
+- * uploaded.
++ *
++ * check active state vs. the previous state
++ * and modify counter accordingly
+ */
+- if (q->properties.is_active && !prev_active)
++ retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
++ if ((q->properties.is_active) && (!prev_active))
+ dqm->queue_count++;
+ else if (!q->properties.is_active && prev_active)
+ dqm->queue_count--;
+
+- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
+- retval = map_queues_cpsch(dqm);
+- else if (q->properties.is_active &&
+- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+- q->properties.type == KFD_QUEUE_TYPE_SDMA))
+- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue,
+- &q->properties, q->process->mm);
++ if (sched_policy != KFD_SCHED_POLICY_NO_HWS)
++ retval = execute_queues_cpsch(dqm, false);
+
+ out_unlock:
+ mutex_unlock(&dqm->lock);
+ return retval;
+ }
+
+-static struct mqd_manager *get_mqd_manager(
++static struct mqd_manager *get_mqd_manager_nocpsch(
+ struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
+ {
+ struct mqd_manager *mqd;
+@@ -572,140 +407,11 @@ static struct mqd_manager *get_mqd_manager(
+ return mqd;
+ }
+
+-int process_evict_queues(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct queue *q, *next;
+- struct mqd_manager *mqd;
+- struct kfd_process_device *pdd;
+- int retval = 0;
+-
+- mutex_lock(&dqm->lock);
+- if (qpd->evicted++ > 0) /* already evicted, do nothing */
+- goto out;
+-
+- pdd = qpd_to_pdd(qpd);
+- pr_info_ratelimited("Evicting PASID %u queues\n",
+- pdd->process->pasid);
+-
+- /* unactivate all active queues on the qpd */
+- list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+- mqd = dqm->ops.get_mqd_manager(dqm,
+- get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd) { /* should not be here */
+- pr_err("Cannot evict queue, mqd is NULL\n");
+- retval = -ENOMEM;
+- goto out;
+- }
+- /* if the queue is not active anyway, it is not evicted */
+- if (q->properties.is_active) {
+- q->properties.is_evicted = true;
+- q->properties.is_active = false;
+- }
+-
+- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+- q->properties.is_evicted &&
+- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+- q->properties.type == KFD_QUEUE_TYPE_SDMA))
+- retval = mqd->destroy_mqd(mqd, q->mqd,
+- KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+- KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+- if (q->properties.is_evicted)
+- dqm->queue_count--;
+- }
+- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
+- retval = execute_queues_cpsch(dqm,
+- qpd->is_debug ?
+- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+-
+-out:
+- mutex_unlock(&dqm->lock);
+- return retval;
+-
+-}
+-
+-int process_restore_queues(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct queue *q, *next;
+- struct mqd_manager *mqd;
+- int retval = 0;
+- struct kfd_process_device *pdd;
+- uint32_t pd_base;
+-
+- pdd = qpd_to_pdd(qpd);
+- /* Retrieve PD base */
+- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
+-
+- mutex_lock(&dqm->lock);
+- if (qpd->evicted == 0) /* already restored, do nothing */
+- goto out_unlock;
+-
+- if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
+- qpd->evicted--;
+- goto out_unlock;
+- }
+-
+- pr_info_ratelimited("Restoring PASID %u queues\n",
+- pdd->process->pasid);
+-
+- /* Update PD Base in QPD */
+- qpd->page_table_base = pd_base;
+- pr_debug("Updated PD address to 0x%08x\n", pd_base);
+-
+- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+- !list_empty(&qpd->queues_list)) {
+- dqm->dev->kfd2kgd->set_vm_context_page_table_base(
+- dqm->dev->kgd,
+- qpd->vmid,
+- qpd->page_table_base);
+-
+- kfd_flush_tlb(dqm->dev, pdd->process);
+- }
+-
+- /* activate all active queues on the qpd */
+- list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+- mqd = dqm->ops.get_mqd_manager(dqm,
+- get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd) { /* should not be here */
+- pr_err("Cannot restore queue, mqd is NULL\n");
+- retval = -ENOMEM;
+- goto out_unlock;
+- }
+- if (q->properties.is_evicted) {
+- q->properties.is_evicted = false;
+- q->properties.is_active = true;
+-
+- if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
+- (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+- q->properties.type == KFD_QUEUE_TYPE_SDMA))
+- retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
+- q->queue, &q->properties,
+- q->process->mm);
+- dqm->queue_count++;
+- }
+- }
+- if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
+- retval = execute_queues_cpsch(dqm,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+-
+- if (retval == 0)
+- qpd->evicted = 0;
+-
+-out_unlock:
+- mutex_unlock(&dqm->lock);
+-
+- return retval;
+-}
+-
+-static int register_process(struct device_queue_manager *dqm,
++static int register_process_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct device_process_node *n;
+ int retval;
+- struct kfd_process_device *pdd;
+- uint32_t pd_base;
+
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
+ if (!n)
+@@ -713,18 +419,10 @@ static int register_process(struct device_queue_manager *dqm,
+
+ n->qpd = qpd;
+
+- pdd = qpd_to_pdd(qpd);
+- /* Retrieve PD base */
+- pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
+-
+ mutex_lock(&dqm->lock);
+ list_add(&n->list, &dqm->queues);
+
+- /* Update PD Base in QPD */
+- qpd->page_table_base = pd_base;
+- pr_debug("Updated PD address to 0x%08x\n", pd_base);
+-
+- retval = dqm->asic_ops.update_qpd(dqm, qpd);
++ retval = dqm->ops_asic_specific.register_process(dqm, qpd);
+
+ dqm->processes_count++;
+
+@@ -733,7 +431,7 @@ static int register_process(struct device_queue_manager *dqm,
+ return retval;
+ }
+
+-static int unregister_process(struct device_queue_manager *dqm,
++static int unregister_process_nocpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ int retval;
+@@ -809,13 +507,13 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
+ dqm->allocated_queues[pipe] |= 1 << queue;
+ }
+
+- dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
+- dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
++ dqm->vmid_bitmap = (1 << VMID_PER_DEVICE) - 1;
++ dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
+
+ return 0;
+ }
+
+-static void uninitialize(struct device_queue_manager *dqm)
++static void uninitialize_nocpsch(struct device_queue_manager *dqm)
+ {
+ int i;
+
+@@ -831,12 +529,11 @@ static void uninitialize(struct device_queue_manager *dqm)
+ static int start_nocpsch(struct device_queue_manager *dqm)
+ {
+ init_interrupts(dqm);
+- return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
++ return 0;
+ }
+
+ static int stop_nocpsch(struct device_queue_manager *dqm)
+ {
+- pm_uninit(&dqm->packets);
+ return 0;
+ }
+
+@@ -848,8 +545,10 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
+ if (dqm->sdma_bitmap == 0)
+ return -ENOMEM;
+
+- bit = ffs(dqm->sdma_bitmap) - 1;
+- dqm->sdma_bitmap &= ~(1 << bit);
++ bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap,
++ CIK_SDMA_QUEUES);
++
++ clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap);
+ *sdma_queue_id = bit;
+
+ return 0;
+@@ -858,9 +557,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
+ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
+ unsigned int sdma_queue_id)
+ {
+- if (sdma_queue_id >= get_num_sdma_queues(dqm))
++ if (sdma_queue_id >= CIK_SDMA_QUEUES)
+ return;
+- dqm->sdma_bitmap |= (1 << sdma_queue_id);
++ set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap);
+ }
+
+ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+@@ -878,22 +577,18 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+ if (retval)
+ return retval;
+
+- q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
+- q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
+-
+- retval = allocate_doorbell(qpd, q);
+- if (retval)
+- goto out_deallocate_sdma_queue;
++ q->properties.sdma_queue_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
++ q->properties.sdma_engine_id = q->sdma_id / CIK_SDMA_ENGINE_NUM;
+
+ pr_debug("SDMA id is: %d\n", q->sdma_id);
+ pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
+ pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
+
+- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
++ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd);
+ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+- goto out_deallocate_doorbell;
++ goto out_deallocate_sdma_queue;
+
+ retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
+ if (retval)
+@@ -903,8 +598,6 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
+
+ out_uninit_mqd:
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+-out_deallocate_doorbell:
+- deallocate_doorbell(qpd, q);
+ out_deallocate_sdma_queue:
+ deallocate_sdma_queue(dqm, q->sdma_id);
+
+@@ -920,7 +613,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
+ int i, mec;
+ struct scheduling_resources res;
+
+- res.vmid_mask = dqm->dev->shared_resources.compute_vmid_bitmap;
++ res.vmid_mask = (1 << VMID_PER_DEVICE) - 1;
++ res.vmid_mask <<= KFD_VMID_START_OFFSET;
+
+ res.queue_mask = 0;
+ for (i = 0; i < KGD_MAX_QUEUES; ++i) {
+@@ -958,6 +652,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
+
+ static int initialize_cpsch(struct device_queue_manager *dqm)
+ {
++ int retval;
++
+ pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
+
+ mutex_init(&dqm->lock);
+@@ -965,18 +661,21 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
+ dqm->queue_count = dqm->processes_count = 0;
+ dqm->sdma_queue_count = 0;
+ dqm->active_runlist = false;
+- dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
++ retval = dqm->ops_asic_specific.initialize(dqm);
++ if (retval)
++ mutex_destroy(&dqm->lock);
+
+- return 0;
++ return retval;
+ }
+
+ static int start_cpsch(struct device_queue_manager *dqm)
+ {
++ struct device_process_node *node;
+ int retval;
+
+ retval = 0;
+
+- retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version);
++ retval = pm_init(&dqm->packets, dqm);
+ if (retval)
+ goto fail_packet_manager_init;
+
+@@ -998,9 +697,12 @@ static int start_cpsch(struct device_queue_manager *dqm)
+
+ init_interrupts(dqm);
+
+- mutex_lock(&dqm->lock);
+- execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+- mutex_unlock(&dqm->lock);
++ list_for_each_entry(node, &dqm->queues, list)
++ if (node->qpd->pqm->process && dqm->dev)
++ kfd_bind_process_to_device(dqm->dev,
++ node->qpd->pqm->process);
++
++ execute_queues_cpsch(dqm, true);
+
+ return 0;
+ fail_allocate_vidmem:
+@@ -1012,12 +714,15 @@ static int start_cpsch(struct device_queue_manager *dqm)
+
+ static int stop_cpsch(struct device_queue_manager *dqm)
+ {
+- mutex_lock(&dqm->lock);
+-
+- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
++ struct device_process_node *node;
++ struct kfd_process_device *pdd;
+
+- mutex_unlock(&dqm->lock);
++ destroy_queues_cpsch(dqm, true, true);
+
++ list_for_each_entry(node, &dqm->queues, list) {
++ pdd = qpd_to_pdd(node->qpd);
++ pdd->bound = false;
++ }
+ kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
+ pm_uninit(&dqm->packets);
+
+@@ -1047,7 +752,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
+ list_add(&kq->list, &qpd->priv_queue_list);
+ dqm->queue_count++;
+ qpd->is_debug = true;
+- execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
++ execute_queues_cpsch(dqm, false);
+ mutex_unlock(&dqm->lock);
+
+ return 0;
+@@ -1058,10 +763,12 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ mutex_lock(&dqm->lock);
++ /* here we actually preempt the DIQ */
++ destroy_queues_cpsch(dqm, true, false);
+ list_del(&kq->list);
+ dqm->queue_count--;
+ qpd->is_debug = false;
+- execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
++ execute_queues_cpsch(dqm, false);
+ /*
+ * Unconditionally decrement this counter, regardless of the queue's
+ * type.
+@@ -1072,68 +779,55 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
+ mutex_unlock(&dqm->lock);
+ }
+
++static void select_sdma_engine_id(struct queue *q)
++{
++ static int sdma_id;
++
++ q->sdma_id = sdma_id;
++ sdma_id = (sdma_id + 1) % 2;
++}
++
+ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+- struct qcm_process_device *qpd)
++ struct qcm_process_device *qpd, int *allocate_vmid)
+ {
+ int retval;
+ struct mqd_manager *mqd;
+
+ retval = 0;
+
++ if (allocate_vmid)
++ *allocate_vmid = 0;
++
+ mutex_lock(&dqm->lock);
+
+ if (dqm->total_queue_count >= max_num_of_queues_per_device) {
+ pr_warn("Can't create new usermode queue because %d queues were already created\n",
+ dqm->total_queue_count);
+ retval = -EPERM;
+- goto out_unlock;
+- }
+-
+- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+- retval = allocate_sdma_queue(dqm, &q->sdma_id);
+- if (retval)
+- goto out_unlock;
+- q->properties.sdma_queue_id =
+- q->sdma_id / get_num_sdma_engines(dqm);
+- q->properties.sdma_engine_id =
+- q->sdma_id % get_num_sdma_engines(dqm);
++ goto out;
+ }
+
+- retval = allocate_doorbell(qpd, q);
+- if (retval)
+- goto out_deallocate_sdma_queue;
++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
++ select_sdma_engine_id(q);
+
+ mqd = dqm->ops.get_mqd_manager(dqm,
+ get_mqd_type_from_queue_type(q->properties.type));
+
+ if (!mqd) {
+ retval = -ENOMEM;
+- goto out_deallocate_doorbell;
++ goto out;
+ }
+- /*
+- * Eviction state logic: we only mark active queues as evicted
+- * to avoid the overhead of restoring inactive queues later
+- */
+- if (qpd->evicted)
+- q->properties.is_evicted = (q->properties.queue_size > 0 &&
+- q->properties.queue_percent > 0 &&
+- q->properties.queue_address != 0);
+
+- dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+-
+- q->properties.tba_addr = qpd->tba_addr;
+- q->properties.tma_addr = qpd->tma_addr;
++ dqm->ops_asic_specific.init_sdma_vm(dqm, q, qpd);
+ retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (retval)
+- goto out_deallocate_doorbell;
++ goto out;
+
+ list_add(&q->list, &qpd->queues_list);
+- qpd->queue_count++;
+ if (q->properties.is_active) {
+ dqm->queue_count++;
+- retval = execute_queues_cpsch(dqm,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
++ retval = execute_queues_cpsch(dqm, false);
+ }
+
+ if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+@@ -1147,28 +841,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
+ pr_debug("Total of %d queues are accountable so far\n",
+ dqm->total_queue_count);
+
++out:
+ mutex_unlock(&dqm->lock);
+ return retval;
+-
+-out_deallocate_doorbell:
+- deallocate_doorbell(qpd, q);
+-out_deallocate_sdma_queue:
+- if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+- deallocate_sdma_queue(dqm, q->sdma_id);
+-out_unlock:
+- mutex_unlock(&dqm->lock);
+-
+- return retval;
+ }
+
+ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ unsigned int fence_value,
+- unsigned int timeout_ms)
++ unsigned long timeout)
+ {
+- unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
++ timeout += jiffies;
+
+ while (*fence_addr != fence_value) {
+- if (time_after(jiffies, end_jiffies)) {
++ if (time_after(jiffies, timeout)) {
+ pr_err("qcm fence wait loop timeout expired\n");
+ return -ETIME;
+ }
+@@ -1178,57 +863,44 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ return 0;
+ }
+
+-static int unmap_sdma_queues(struct device_queue_manager *dqm,
++static int destroy_sdma_queues(struct device_queue_manager *dqm,
+ unsigned int sdma_engine)
+ {
+ return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES, 0, false,
+ sdma_engine);
+ }
+
+-/* dqm->lock mutex has to be locked before calling this function */
+-static int map_queues_cpsch(struct device_queue_manager *dqm)
++static int destroy_queues_cpsch(struct device_queue_manager *dqm,
++ bool preempt_static_queues, bool lock)
+ {
+ int retval;
++ enum kfd_preempt_type_filter preempt_type;
++ struct kfd_process_device *pdd;
+
+- if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
+- return 0;
+-
+- if (dqm->active_runlist)
+- return 0;
+-
+- retval = pm_send_runlist(&dqm->packets, &dqm->queues);
+- if (retval) {
+- pr_err("failed to execute runlist\n");
+- return retval;
+- }
+- dqm->active_runlist = true;
+-
+- return retval;
+-}
+-
+-/* dqm->lock mutex has to be locked before calling this function */
+-static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param)
+-{
+- int retval = 0;
++ retval = 0;
+
++ if (lock)
++ mutex_lock(&dqm->lock);
+ if (!dqm->active_runlist)
+- return retval;
++ goto out;
+
+ pr_debug("Before destroying queues, sdma queue count is : %u\n",
+ dqm->sdma_queue_count);
+
+ if (dqm->sdma_queue_count > 0) {
+- unmap_sdma_queues(dqm, 0);
+- unmap_sdma_queues(dqm, 1);
++ destroy_sdma_queues(dqm, 0);
++ destroy_sdma_queues(dqm, 1);
+ }
+
++ preempt_type = preempt_static_queues ?
++ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES :
++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES;
++
+ retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
+- filter, filter_param, false, 0);
++ preempt_type, 0, false, 0);
+ if (retval)
+- return retval;
++ goto out;
+
+ *dqm->fence_addr = KFD_FENCE_INIT;
+ pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr,
+@@ -1236,29 +908,55 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+ /* should be timed out */
+ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
+ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
+- if (retval)
+- return retval;
+-
++ if (retval) {
++ pdd = kfd_get_process_device_data(dqm->dev,
++ kfd_get_process(current));
++ pdd->reset_wavefronts = true;
++ goto out;
++ }
+ pm_release_ib(&dqm->packets);
+ dqm->active_runlist = false;
+
++out:
++ if (lock)
++ mutex_unlock(&dqm->lock);
+ return retval;
+ }
+
+-/* dqm->lock mutex has to be locked before calling this function */
+-static int execute_queues_cpsch(struct device_queue_manager *dqm,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param)
++static int execute_queues_cpsch(struct device_queue_manager *dqm, bool lock)
+ {
+ int retval;
+
+- retval = unmap_queues_cpsch(dqm, filter, filter_param);
++ if (lock)
++ mutex_lock(&dqm->lock);
++
++ retval = destroy_queues_cpsch(dqm, false, false);
+ if (retval) {
+- pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+- return retval;
++ pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption");
++ goto out;
++ }
++
++ if (dqm->queue_count <= 0 || dqm->processes_count <= 0) {
++ retval = 0;
++ goto out;
++ }
++
++ if (dqm->active_runlist) {
++ retval = 0;
++ goto out;
+ }
+
+- return map_queues_cpsch(dqm);
++ retval = pm_send_runlist(&dqm->packets, &dqm->queues);
++ if (retval) {
++ pr_err("failed to execute runlist");
++ goto out;
++ }
++ dqm->active_runlist = true;
++
++out:
++ if (lock)
++ mutex_unlock(&dqm->lock);
++ return retval;
+ }
+
+ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+@@ -1293,22 +991,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+ goto failed;
+ }
+
+- deallocate_doorbell(qpd, q);
+-
+- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
+ dqm->sdma_queue_count--;
+- deallocate_sdma_queue(dqm, q->sdma_id);
+- }
+
+ list_del(&q->list);
+- qpd->queue_count--;
+- if (q->properties.is_active) {
++ if (q->properties.is_active)
+ dqm->queue_count--;
+- retval = execute_queues_cpsch(dqm,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
+- if (retval == -ETIME)
+- qpd->reset_wavefronts = true;
+- }
++
++ execute_queues_cpsch(dqm, false);
+
+ mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+
+@@ -1322,7 +1012,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
+
+ mutex_unlock(&dqm->lock);
+
+- return retval;
++ return 0;
+
+ failed:
+ failed_try_destroy_debugged_queue:
+@@ -1346,10 +1036,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size)
+ {
+- bool retval = true;
+-
+- if (!dqm->asic_ops.set_cache_memory_policy)
+- return retval;
++ bool retval;
+
+ mutex_lock(&dqm->lock);
+
+@@ -1381,7 +1068,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
+ qpd->sh_mem_ape1_limit = limit >> 16;
+ }
+
+- retval = dqm->asic_ops.set_cache_memory_policy(
++ retval = dqm->ops_asic_specific.set_cache_memory_policy(
+ dqm,
+ qpd,
+ default_policy,
+@@ -1389,7 +1076,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
+ alternate_aperture_base,
+ alternate_aperture_size);
+
+- if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
++ if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0))
+ program_sh_mem_settings(dqm, qpd);
+
+ pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n",
+@@ -1401,166 +1088,6 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
+ return retval;
+ }
+
+-static int set_trap_handler(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd,
+- uint64_t tba_addr,
+- uint64_t tma_addr)
+-{
+- uint64_t *tma;
+-
+- if (dqm->dev->cwsr_enabled) {
+- /* Jump from CWSR trap handler to user trap */
+- tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET);
+- tma[0] = tba_addr;
+- tma[1] = tma_addr;
+- } else {
+- qpd->tba_addr = tba_addr;
+- qpd->tma_addr = tma_addr;
+- }
+-
+- return 0;
+-}
+-
+-static int process_termination_nocpsch(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct queue *q, *next;
+- struct device_process_node *cur, *next_dpn;
+- int retval = 0;
+-
+- mutex_lock(&dqm->lock);
+-
+- /* Clear all user mode queues */
+- list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+- int ret;
+-
+- ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
+- if (ret)
+- retval = ret;
+- }
+-
+- /* Unregister process */
+- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+- if (qpd == cur->qpd) {
+- list_del(&cur->list);
+- kfree(cur);
+- dqm->processes_count--;
+- break;
+- }
+- }
+-
+- mutex_unlock(&dqm->lock);
+- return retval;
+-}
+-
+-static int get_wave_state(struct device_queue_manager *dqm,
+- struct queue *q,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size)
+-{
+- struct mqd_manager *mqd;
+- int r;
+-
+- mutex_lock(&dqm->lock);
+-
+- if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE ||
+- q->properties.is_active || !q->device->cwsr_enabled) {
+- r = -EINVAL;
+- goto dqm_unlock;
+- }
+-
+- mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+- if (!mqd) {
+- r = -ENOMEM;
+- goto dqm_unlock;
+- }
+-
+- if (!mqd->get_wave_state) {
+- r = -EINVAL;
+- goto dqm_unlock;
+- }
+-
+- r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size,
+- save_area_used_size);
+-
+-dqm_unlock:
+- mutex_unlock(&dqm->lock);
+- return r;
+-}
+-
+-static int process_termination_cpsch(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- int retval;
+- struct queue *q, *next;
+- struct kernel_queue *kq, *kq_next;
+- struct mqd_manager *mqd;
+- struct device_process_node *cur, *next_dpn;
+- enum kfd_unmap_queues_filter filter =
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+-
+- retval = 0;
+-
+- mutex_lock(&dqm->lock);
+-
+- /* Clean all kernel queues */
+- list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
+- list_del(&kq->list);
+- dqm->queue_count--;
+- qpd->is_debug = false;
+- dqm->total_queue_count--;
+- filter = KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES;
+- }
+-
+- /* Clear all user mode queues */
+- list_for_each_entry(q, &qpd->queues_list, list) {
+- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+- dqm->sdma_queue_count--;
+- deallocate_sdma_queue(dqm, q->sdma_id);
+- }
+-
+- if (q->properties.is_active)
+- dqm->queue_count--;
+-
+- dqm->total_queue_count--;
+- }
+-
+- /* Unregister process */
+- list_for_each_entry_safe(cur, next_dpn, &dqm->queues, list) {
+- if (qpd == cur->qpd) {
+- list_del(&cur->list);
+- kfree(cur);
+- dqm->processes_count--;
+- break;
+- }
+- }
+-
+- retval = execute_queues_cpsch(dqm, filter, 0);
+- if (retval || qpd->reset_wavefronts) {
+- pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
+- dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
+- qpd->reset_wavefronts = false;
+- }
+-
+- /* lastly, free mqd resources */
+- list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+- mqd = dqm->ops.get_mqd_manager(dqm,
+- get_mqd_type_from_queue_type(q->properties.type));
+- if (!mqd) {
+- retval = -ENOMEM;
+- goto out;
+- }
+- list_del(&q->list);
+- qpd->queue_count--;
+- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+- }
+-
+-out:
+- mutex_unlock(&dqm->lock);
+- return retval;
+-}
+-
+ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ {
+ struct device_queue_manager *dqm;
+@@ -1571,18 +1098,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ if (!dqm)
+ return NULL;
+
+- switch (dev->device_info->asic_family) {
+- case CHIP_HAWAII:
+- case CHIP_TONGA:
+- dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS;
+- break;
+- default:
+- dqm->sched_policy = sched_policy;
+- break;
+- }
+-
+ dqm->dev = dev;
+- switch (dqm->sched_policy) {
++ switch (sched_policy) {
+ case KFD_SCHED_POLICY_HWS:
+ case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION:
+ /* initialize dqm for cp scheduling */
+@@ -1592,16 +1109,13 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ dqm->ops.stop = stop_cpsch;
+ dqm->ops.destroy_queue = destroy_queue_cpsch;
+ dqm->ops.update_queue = update_queue;
+- dqm->ops.get_mqd_manager = get_mqd_manager;
+- dqm->ops.register_process = register_process;
+- dqm->ops.unregister_process = unregister_process;
+- dqm->ops.uninitialize = uninitialize;
++ dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
++ dqm->ops.register_process = register_process_nocpsch;
++ dqm->ops.unregister_process = unregister_process_nocpsch;
++ dqm->ops.uninitialize = uninitialize_nocpsch;
+ dqm->ops.create_kernel_queue = create_kernel_queue_cpsch;
+ dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch;
+ dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+- dqm->ops.set_trap_handler = set_trap_handler;
+- dqm->ops.process_termination = process_termination_cpsch;
+- dqm->ops.get_wave_state = get_wave_state;
+ break;
+ case KFD_SCHED_POLICY_NO_HWS:
+ /* initialize dqm for no cp scheduling */
+@@ -1610,49 +1124,26 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
+ dqm->ops.create_queue = create_queue_nocpsch;
+ dqm->ops.destroy_queue = destroy_queue_nocpsch;
+ dqm->ops.update_queue = update_queue;
+- dqm->ops.get_mqd_manager = get_mqd_manager;
+- dqm->ops.register_process = register_process;
+- dqm->ops.unregister_process = unregister_process;
++ dqm->ops.get_mqd_manager = get_mqd_manager_nocpsch;
++ dqm->ops.register_process = register_process_nocpsch;
++ dqm->ops.unregister_process = unregister_process_nocpsch;
+ dqm->ops.initialize = initialize_nocpsch;
+- dqm->ops.uninitialize = uninitialize;
++ dqm->ops.uninitialize = uninitialize_nocpsch;
+ dqm->ops.set_cache_memory_policy = set_cache_memory_policy;
+- dqm->ops.set_trap_handler = set_trap_handler;
+- dqm->ops.process_termination = process_termination_nocpsch;
+- dqm->ops.get_wave_state = get_wave_state;
+ break;
+ default:
+- pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
++ pr_err("Invalid scheduling policy %d\n", sched_policy);
+ goto out_free;
+ }
+
+ switch (dev->device_info->asic_family) {
+ case CHIP_CARRIZO:
+- device_queue_manager_init_vi(&dqm->asic_ops);
++ device_queue_manager_init_vi(&dqm->ops_asic_specific);
+ break;
+
+ case CHIP_KAVERI:
+- device_queue_manager_init_cik(&dqm->asic_ops);
++ device_queue_manager_init_cik(&dqm->ops_asic_specific);
+ break;
+-
+- case CHIP_HAWAII:
+- device_queue_manager_init_cik_hawaii(&dqm->asic_ops);
+- break;
+-
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+- device_queue_manager_init_vi_tonga(&dqm->asic_ops);
+- break;
+-
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- device_queue_manager_init_v9_vega10(&dqm->asic_ops);
+- break;
+- default:
+- WARN(1, "Unexpected ASIC family %u",
+- dev->device_info->asic_family);
+- goto out_free;
+ }
+
+ if (!dqm->ops.initialize(dqm))
+@@ -1668,87 +1159,3 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
+ dqm->ops.uninitialize(dqm);
+ kfree(dqm);
+ }
+-
+-int kfd_process_vm_fault(struct device_queue_manager *dqm,
+- unsigned int pasid)
+-{
+- struct kfd_process_device *pdd;
+- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+- int ret = 0;
+-
+- if (!p)
+- return -EINVAL;
+- pdd = kfd_get_process_device_data(dqm->dev, p);
+- if (pdd)
+- ret = process_evict_queues(dqm, &pdd->qpd);
+- kfd_unref_process(p);
+-
+- return ret;
+-}
+-
+-static void seq_reg_dump(struct seq_file *m,
+- uint32_t (*dump)[2], uint32_t n_regs)
+-{
+- uint32_t i, count;
+-
+- for (i = 0, count = 0; i < n_regs; i++) {
+- if (count == 0 ||
+- dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) {
+- seq_printf(m, "%s %08x: %08x",
+- i ? "\n" : "",
+- dump[i][0], dump[i][1]);
+- count = 7;
+- } else {
+- seq_printf(m, " %08x", dump[i][1]);
+- count--;
+- }
+- }
+-
+- seq_puts(m, "\n");
+-}
+-
+-int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data)
+-{
+- struct device_queue_manager *dqm = data;
+- uint32_t (*dump)[2], n_regs;
+- int pipe, queue;
+- int r = 0;
+-
+- for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
+- int pipe_offset = pipe * get_queues_per_pipe(dqm);
+-
+- for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) {
+- if (!test_bit(pipe_offset + queue,
+- dqm->dev->shared_resources.queue_bitmap))
+- continue;
+-
+- r = dqm->dev->kfd2kgd->hqd_dump(
+- dqm->dev->kgd, pipe, queue, &dump, &n_regs);
+- if (r)
+- break;
+-
+- seq_printf(m, " CP Pipe %d, Queue %d\n",
+- pipe, queue);
+- seq_reg_dump(m, dump, n_regs);
+-
+- kfree(dump);
+- }
+- }
+-
+- for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
+- for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
+- r = dqm->dev->kfd2kgd->hqd_sdma_dump(
+- dqm->dev->kgd, pipe, queue, &dump, &n_regs);
+- if (r)
+- break;
+-
+- seq_printf(m, " SDMA Engine %d, RLC %d\n",
+- pipe, queue);
+- seq_reg_dump(m, dump, n_regs);
+-
+- kfree(dump);
+- }
+- }
+-
+- return r;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+index 9785680..faf820a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+@@ -29,10 +29,14 @@
+ #include "kfd_priv.h"
+ #include "kfd_mqd_manager.h"
+
+-#define KFD_UNMAP_LATENCY_MS (4000)
+-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+-
+-#define KFD_SDMA_QUEUES_PER_ENGINE (2)
++#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (500)
++#define CIK_VMID_NUM (8)
++#define KFD_VMID_START_OFFSET (8)
++#define VMID_PER_DEVICE CIK_VMID_NUM
++#define KFD_DQM_FIRST_PIPE (0)
++#define CIK_SDMA_QUEUES (4)
++#define CIK_SDMA_QUEUES_PER_ENGINE (2)
++#define CIK_SDMA_ENGINE_NUM (2)
+
+ struct device_process_node {
+ struct qcm_process_device *qpd;
+@@ -75,16 +79,13 @@ struct device_process_node {
+ * @set_cache_memory_policy: Sets memory policy (cached/ non cached) for the
+ * memory apertures.
+ *
+- * @process_termination: Clears all process queues belongs to that device.
+- *
+- * @get_wave_state: Retrieves context save state and optionally copies the
+- * control stack, if kept in the MQD, to the given userspace address.
+ */
+
+ struct device_queue_manager_ops {
+ int (*create_queue)(struct device_queue_manager *dqm,
+ struct queue *q,
+- struct qcm_process_device *qpd);
++ struct qcm_process_device *qpd,
++ int *allocate_vmid);
+
+ int (*destroy_queue)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+@@ -121,25 +122,12 @@ struct device_queue_manager_ops {
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
+-
+- int (*set_trap_handler)(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd,
+- uint64_t tba_addr,
+- uint64_t tma_addr);
+-
+- int (*process_termination)(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-
+- int (*get_wave_state)(struct device_queue_manager *dqm,
+- struct queue *q,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size);
+ };
+
+ struct device_queue_manager_asic_ops {
+- int (*update_qpd)(struct device_queue_manager *dqm,
++ int (*register_process)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
++ int (*initialize)(struct device_queue_manager *dqm);
+ bool (*set_cache_memory_policy)(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ enum cache_policy default_policy,
+@@ -165,7 +153,7 @@ struct device_queue_manager_asic_ops {
+
+ struct device_queue_manager {
+ struct device_queue_manager_ops ops;
+- struct device_queue_manager_asic_ops asic_ops;
++ struct device_queue_manager_asic_ops ops_asic_specific;
+
+ struct mqd_manager *mqds[KFD_MQD_TYPE_MAX];
+ struct packet_manager packets;
+@@ -186,38 +174,21 @@ struct device_queue_manager {
+ unsigned int *fence_addr;
+ struct kfd_mem_obj *fence_mem;
+ bool active_runlist;
+- int sched_policy;
+ };
+
+-void device_queue_manager_init_cik(
+- struct device_queue_manager_asic_ops *asic_ops);
+-void device_queue_manager_init_cik_hawaii(
+- struct device_queue_manager_asic_ops *asic_ops);
+-void device_queue_manager_init_vi(
+- struct device_queue_manager_asic_ops *asic_ops);
+-void device_queue_manager_init_vi_tonga(
+- struct device_queue_manager_asic_ops *asic_ops);
+-void device_queue_manager_init_v9_vega10(
+- struct device_queue_manager_asic_ops *asic_ops);
++void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops);
++void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops);
+ void program_sh_mem_settings(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
+ unsigned int get_queues_num(struct device_queue_manager *dqm);
+ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
+ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
+-unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
+-
+-int process_evict_queues(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-int process_restore_queues(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-
+
+ static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
+ {
+ return (pdd->lds_base >> 16) & 0xFF;
+ }
+
+-/* This function is only useful for GFXv7 and v8 */
+ static inline unsigned int
+ get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd)
+ {
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+index aed4c21..72c3cba 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+@@ -32,30 +32,18 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
+-static int update_qpd_cik(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-static int update_qpd_cik_hawaii(struct device_queue_manager *dqm,
++static int register_process_cik(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
++static int initialize_cpsch_cik(struct device_queue_manager *dqm);
+ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+-static void init_sdma_vm_hawaii(struct device_queue_manager *dqm,
+- struct queue *q,
+- struct qcm_process_device *qpd);
+-
+-void device_queue_manager_init_cik(
+- struct device_queue_manager_asic_ops *asic_ops)
+-{
+- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
+- asic_ops->update_qpd = update_qpd_cik;
+- asic_ops->init_sdma_vm = init_sdma_vm;
+-}
+
+-void device_queue_manager_init_cik_hawaii(
+- struct device_queue_manager_asic_ops *asic_ops)
++void device_queue_manager_init_cik(struct device_queue_manager_asic_ops *ops)
+ {
+- asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
+- asic_ops->update_qpd = update_qpd_cik_hawaii;
+- asic_ops->init_sdma_vm = init_sdma_vm_hawaii;
++ ops->set_cache_memory_policy = set_cache_memory_policy_cik;
++ ops->register_process = register_process_cik;
++ ops->initialize = initialize_cpsch_cik;
++ ops->init_sdma_vm = init_sdma_vm;
+ }
+
+ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
+@@ -111,7 +99,7 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm,
+ return true;
+ }
+
+-static int update_qpd_cik(struct device_queue_manager *dqm,
++static int register_process_cik(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct kfd_process_device *pdd;
+@@ -145,36 +133,6 @@ static int update_qpd_cik(struct device_queue_manager *dqm,
+ return 0;
+ }
+
+-static int update_qpd_cik_hawaii(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct kfd_process_device *pdd;
+- unsigned int temp;
+-
+- pdd = qpd_to_pdd(qpd);
+-
+- /* check if sh_mem_config register already configured */
+- if (qpd->sh_mem_config == 0) {
+- qpd->sh_mem_config =
+- ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) |
+- DEFAULT_MTYPE(MTYPE_NONCACHED) |
+- APE1_MTYPE(MTYPE_NONCACHED);
+- qpd->sh_mem_ape1_limit = 0;
+- qpd->sh_mem_ape1_base = 0;
+- }
+-
+- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit
+- * aperture addresses.
+- */
+- temp = get_sh_mem_bases_nybble_64(pdd);
+- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
+-
+- pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n",
+- qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases);
+-
+- return 0;
+-}
+-
+ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+ {
+@@ -191,15 +149,7 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ q->properties.sdma_vm_addr = value;
+ }
+
+-static void init_sdma_vm_hawaii(struct device_queue_manager *dqm,
+- struct queue *q,
+- struct qcm_process_device *qpd)
++static int initialize_cpsch_cik(struct device_queue_manager *dqm)
+ {
+- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit
+- * aperture addresses.
+- */
+- q->properties.sdma_vm_addr =
+- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) <<
+- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) &
+- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK;
++ return 0;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+deleted file mode 100644
+index 9c6c83a9..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
++++ /dev/null
+@@ -1,83 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-#include "kfd_device_queue_manager.h"
+-#include "vega10_enum.h"
+-#include "gc/gc_9_0_offset.h"
+-#include "gc/gc_9_0_sh_mask.h"
+-#include "sdma0/sdma0_4_0_sh_mask.h"
+-
+-static int update_qpd_v9(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
+- struct qcm_process_device *qpd);
+-
+-void device_queue_manager_init_v9_vega10(
+- struct device_queue_manager_asic_ops *asic_ops)
+-{
+- asic_ops->update_qpd = update_qpd_v9;
+- asic_ops->init_sdma_vm = init_sdma_vm_v9;
+-}
+-
+-static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
+-{
+- uint32_t shared_base = pdd->lds_base >> 48;
+- uint32_t private_base = pdd->scratch_base >> 48;
+-
+- return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
+- private_base;
+-}
+-
+-static int update_qpd_v9(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct kfd_process_device *pdd;
+-
+- pdd = qpd_to_pdd(qpd);
+-
+- /* check if sh_mem_config register already configured */
+- if (qpd->sh_mem_config == 0) {
+- qpd->sh_mem_config =
+- SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+- if (vega10_noretry)
+- qpd->sh_mem_config |=
+- 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+-
+- qpd->sh_mem_ape1_limit = 0;
+- qpd->sh_mem_ape1_base = 0;
+- }
+-
+- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+-
+- pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+-
+- return 0;
+-}
+-
+-static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q,
+- struct qcm_process_device *qpd)
+-{
+- /* Not needed on SDMAv4 any more */
+- q->properties.sdma_vm_addr = 0;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+index 030b014..40e9ddd 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+@@ -33,41 +33,18 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
+ enum cache_policy alternate_policy,
+ void __user *alternate_aperture_base,
+ uint64_t alternate_aperture_size);
+-static int update_qpd_vi(struct device_queue_manager *dqm,
++static int register_process_vi(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd);
++static int initialize_cpsch_vi(struct device_queue_manager *dqm);
+ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd);
+
+-/*
+- * Tonga device queue manager functions
+- */
+-static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd,
+- enum cache_policy default_policy,
+- enum cache_policy alternate_policy,
+- void __user *alternate_aperture_base,
+- uint64_t alternate_aperture_size);
+-static int update_qpd_vi_tonga(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd);
+-static void init_sdma_vm_tonga(struct device_queue_manager *dqm,
+- struct queue *q,
+- struct qcm_process_device *qpd);
+-
+-void device_queue_manager_init_vi_tonga(
+- struct device_queue_manager_asic_ops *asic_ops)
++void device_queue_manager_init_vi(struct device_queue_manager_asic_ops *ops)
+ {
+- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga;
+- asic_ops->update_qpd = update_qpd_vi_tonga;
+- asic_ops->init_sdma_vm = init_sdma_vm_tonga;
+-}
+-
+-
+-void device_queue_manager_init_vi(
+- struct device_queue_manager_asic_ops *asic_ops)
+-{
+- asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
+- asic_ops->update_qpd = update_qpd_vi;
+- asic_ops->init_sdma_vm = init_sdma_vm;
++ ops->set_cache_memory_policy = set_cache_memory_policy_vi;
++ ops->register_process = register_process_vi;
++ ops->initialize = initialize_cpsch_vi;
++ ops->init_sdma_vm = init_sdma_vm;
+ }
+
+ static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
+@@ -127,34 +104,7 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm,
+ return true;
+ }
+
+-static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd,
+- enum cache_policy default_policy,
+- enum cache_policy alternate_policy,
+- void __user *alternate_aperture_base,
+- uint64_t alternate_aperture_size)
+-{
+- uint32_t default_mtype;
+- uint32_t ape1_mtype;
+-
+- default_mtype = (default_policy == cache_policy_coherent) ?
+- MTYPE_UC :
+- MTYPE_NC;
+-
+- ape1_mtype = (alternate_policy == cache_policy_coherent) ?
+- MTYPE_UC :
+- MTYPE_NC;
+-
+- qpd->sh_mem_config =
+- SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
+- default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
+- ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
+-
+- return true;
+-}
+-
+-static int update_qpd_vi(struct device_queue_manager *dqm,
++static int register_process_vi(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd)
+ {
+ struct kfd_process_device *pdd;
+@@ -195,40 +145,6 @@ static int update_qpd_vi(struct device_queue_manager *dqm,
+ return 0;
+ }
+
+-static int update_qpd_vi_tonga(struct device_queue_manager *dqm,
+- struct qcm_process_device *qpd)
+-{
+- struct kfd_process_device *pdd;
+- unsigned int temp;
+-
+- pdd = qpd_to_pdd(qpd);
+-
+- /* check if sh_mem_config register already configured */
+- if (qpd->sh_mem_config == 0) {
+- qpd->sh_mem_config =
+- SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+- SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT |
+- MTYPE_UC <<
+- SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT |
+- MTYPE_UC <<
+- SH_MEM_CONFIG__APE1_MTYPE__SHIFT;
+-
+- qpd->sh_mem_ape1_limit = 0;
+- qpd->sh_mem_ape1_base = 0;
+- }
+-
+- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit
+- * aperture addresses.
+- */
+- temp = get_sh_mem_bases_nybble_64(pdd);
+- qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp);
+-
+- pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n",
+- temp, qpd->sh_mem_bases);
+-
+- return 0;
+-}
+-
+ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ struct qcm_process_device *qpd)
+ {
+@@ -245,15 +161,7 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q,
+ q->properties.sdma_vm_addr = value;
+ }
+
+-static void init_sdma_vm_tonga(struct device_queue_manager *dqm,
+- struct queue *q,
+- struct qcm_process_device *qpd)
++static int initialize_cpsch_vi(struct device_queue_manager *dqm)
+ {
+- /* On dGPU we're always in GPUVM64 addressing mode with 64-bit
+- * aperture addresses.
+- */
+- q->properties.sdma_vm_addr =
+- ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) <<
+- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) &
+- SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK;
++ return 0;
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+index fc41689..acf4d2a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+@@ -24,15 +24,17 @@
+ #include <linux/mman.h>
+ #include <linux/slab.h>
+ #include <linux/io.h>
+-#include <linux/idr.h>
+
+ /*
+- * This extension supports a kernel level doorbells management for the
+- * kernel queues using the first doorbell page reserved for the kernel.
++ * This extension supports a kernel level doorbells management for
++ * the kernel queues.
++ * Basically the last doorbells page is devoted to kernel queues
++ * and that's assures that any user process won't get access to the
++ * kernel doorbells page
+ */
+
+-static DEFINE_IDA(doorbell_ida);
+-static unsigned int max_doorbell_slices;
++#define KERNEL_DOORBELL_PASID 1
++#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4
+
+ /*
+ * Each device exposes a doorbell aperture, a PCI MMIO aperture that
+@@ -49,9 +51,9 @@ static unsigned int max_doorbell_slices;
+ */
+
+ /* # of doorbell bytes allocated for each process. */
+-size_t kfd_doorbell_process_slice(struct kfd_dev *kfd)
++static inline size_t doorbell_process_allocation(void)
+ {
+- return roundup(kfd->device_info->doorbell_size *
++ return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES *
+ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
+ PAGE_SIZE);
+ }
+@@ -71,30 +73,27 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
+
+ doorbell_start_offset =
+ roundup(kfd->shared_resources.doorbell_start_offset,
+- kfd_doorbell_process_slice(kfd));
++ doorbell_process_allocation());
+
+ doorbell_aperture_size =
+ rounddown(kfd->shared_resources.doorbell_aperture_size,
+- kfd_doorbell_process_slice(kfd));
++ doorbell_process_allocation());
+
+ if (doorbell_aperture_size > doorbell_start_offset)
+ doorbell_process_limit =
+ (doorbell_aperture_size - doorbell_start_offset) /
+- kfd_doorbell_process_slice(kfd);
++ doorbell_process_allocation();
+ else
+- return -ENOSPC;
+-
+- if (!max_doorbell_slices ||
+- doorbell_process_limit < max_doorbell_slices)
+- max_doorbell_slices = doorbell_process_limit;
++ doorbell_process_limit = 0;
+
+ kfd->doorbell_base = kfd->shared_resources.doorbell_physical_address +
+ doorbell_start_offset;
+
+ kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32);
++ kfd->doorbell_process_limit = doorbell_process_limit - 1;
+
+ kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base,
+- kfd_doorbell_process_slice(kfd));
++ doorbell_process_allocation());
+
+ if (!kfd->doorbell_kernel_ptr)
+ return -ENOMEM;
+@@ -115,7 +114,8 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
+ pr_debug("doorbell aperture size == 0x%08lX\n",
+ kfd->shared_resources.doorbell_aperture_size);
+
+- pr_debug("doorbell kernel address == 0x%p\n", kfd->doorbell_kernel_ptr);
++ pr_debug("doorbell kernel address == 0x%08lX\n",
++ (uintptr_t)kfd->doorbell_kernel_ptr);
+
+ return 0;
+ }
+@@ -126,16 +126,21 @@ void kfd_doorbell_fini(struct kfd_dev *kfd)
+ iounmap(kfd->doorbell_kernel_ptr);
+ }
+
+-int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
+- struct vm_area_struct *vma)
++int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
+ {
+ phys_addr_t address;
++ struct kfd_dev *dev;
+
+ /*
+ * For simplicitly we only allow mapping of the entire doorbell
+ * allocation of a single device & process.
+ */
+- if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev))
++ if (vma->vm_end - vma->vm_start != doorbell_process_allocation())
++ return -EINVAL;
++
++ /* Find kfd device according to gpu id */
++ dev = kfd_device_by_id(vma->vm_pgoff);
++ if (!dev)
+ return -EINVAL;
+
+ /* Calculate physical address of doorbell */
+@@ -152,19 +157,19 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process,
+ " vm_flags == 0x%04lX\n"
+ " size == 0x%04lX\n",
+ (unsigned long long) vma->vm_start, address, vma->vm_flags,
+- kfd_doorbell_process_slice(dev));
++ doorbell_process_allocation());
+
+
+ return io_remap_pfn_range(vma,
+ vma->vm_start,
+ address >> PAGE_SHIFT,
+- kfd_doorbell_process_slice(dev),
++ doorbell_process_allocation(),
+ vma->vm_page_prot);
+ }
+
+
+ /* get kernel iomem pointer for a doorbell */
+-void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
++u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+ unsigned int *doorbell_off)
+ {
+ u32 inx;
+@@ -179,18 +184,17 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+ if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
+ return NULL;
+
+- inx *= kfd->device_info->doorbell_size / sizeof(u32);
+-
+ /*
+- * Calculating the kernel doorbell offset using the first
+- * doorbell page.
++ * Calculating the kernel doorbell offset using "faked" kernel
++ * pasid that allocated for kernel queues only
+ */
+- *doorbell_off = kfd->doorbell_id_offset + inx;
++ *doorbell_off = KERNEL_DOORBELL_PASID * (doorbell_process_allocation() /
++ sizeof(u32)) + inx;
+
+ pr_debug("Get kernel queue doorbell\n"
+ " doorbell offset == 0x%08X\n"
+- " kernel address == 0x%p\n",
+- *doorbell_off, (kfd->doorbell_kernel_ptr + inx));
++ " kernel address == 0x%08lX\n",
++ *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx));
+
+ return kfd->doorbell_kernel_ptr + inx;
+ }
+@@ -206,7 +210,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr)
+ mutex_unlock(&kfd->doorbell_mutex);
+ }
+
+-void write_kernel_doorbell(void __iomem *db, u32 value)
++inline void write_kernel_doorbell(u32 __iomem *db, u32 value)
+ {
+ if (db) {
+ writel(value, db);
+@@ -214,40 +218,29 @@ void write_kernel_doorbell(void __iomem *db, u32 value)
+ }
+ }
+
+-void write_kernel_doorbell64(void __iomem *db, u64 value)
+-{
+- if (db) {
+- WARN(((unsigned long)db & 7) != 0,
+- "Unaligned 64-bit doorbell");
+- writeq(value, (u64 __iomem *)db);
+- pr_debug("writing %llu to doorbell address 0x%p\n", value, db);
+- }
+-}
+-
+ /*
+ * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1
+ * to doorbells with the process's doorbell page
+ */
+-unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
++unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
+ struct kfd_process *process,
+- unsigned int doorbell_id)
++ unsigned int queue_id)
+ {
+ /*
+ * doorbell_id_offset accounts for doorbells taken by KGD.
+- * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to
+- * the process's doorbells. The offset returned is in dword
+- * units regardless of the ASIC-dependent doorbell size.
++ * pasid * doorbell_process_allocation/sizeof(u32) adjusts
++ * to the process's doorbells
+ */
+ return kfd->doorbell_id_offset +
+- process->doorbell_index * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) +
+- doorbell_id * kfd->device_info->doorbell_size / sizeof(u32);
++ process->pasid * (doorbell_process_allocation()/sizeof(u32)) +
++ queue_id;
+ }
+
+ uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
+ {
+ uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size -
+ kfd->shared_resources.doorbell_start_offset) /
+- kfd_doorbell_process_slice(kfd) + 1;
++ doorbell_process_allocation() + 1;
+
+ return num_of_elems;
+
+@@ -257,21 +250,5 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
+ struct kfd_process *process)
+ {
+ return dev->doorbell_base +
+- process->doorbell_index * kfd_doorbell_process_slice(dev);
+-}
+-
+-int kfd_alloc_process_doorbells(struct kfd_process *process)
+-{
+- int r = ida_simple_get(&doorbell_ida, 1, max_doorbell_slices,
+- GFP_KERNEL);
+- if (r > 0)
+- process->doorbell_index = r;
+-
+- return r;
+-}
+-
+-void kfd_free_process_doorbells(struct kfd_process *process)
+-{
+- if (process->doorbell_index)
+- ida_simple_remove(&doorbell_ida, process->doorbell_index);
++ process->pasid * doorbell_process_allocation();
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+index ee3c288..5979158 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+@@ -23,9 +23,9 @@
+ #include <linux/mm_types.h>
+ #include <linux/slab.h>
+ #include <linux/types.h>
+-#include <linux/sched/mm.h>
+ #include <linux/sched/signal.h>
+ #include <linux/uaccess.h>
++#include <linux/mm.h>
+ #include <linux/mman.h>
+ #include <linux/memory.h>
+ #include "kfd_priv.h"
+@@ -33,105 +33,185 @@
+ #include <linux/device.h>
+
+ /*
+- * Wrapper around wait_queue_entry_t
++ * A task can only be on a single wait_queue at a time, but we need to support
++ * waiting on multiple events (any/all).
++ * Instead of each event simply having a wait_queue with sleeping tasks, it
++ * has a singly-linked list of tasks.
++ * A thread that wants to sleep creates an array of these, one for each event
++ * and adds one to each event's waiter chain.
+ */
+ struct kfd_event_waiter {
+- wait_queue_entry_t wait;
+- struct kfd_event *event; /* Event to wait for */
+- bool activated; /* Becomes true when event is signaled */
++ struct list_head waiters;
++ struct task_struct *sleeping_task;
++
++ /* Transitions to true when the event this belongs to is signaled. */
++ bool activated;
++
++ /* Event */
++ struct kfd_event *event;
++ uint32_t input_index;
+ };
+
+ /*
++ * Over-complicated pooled allocator for event notification slots.
++ *
+ * Each signal event needs a 64-bit signal slot where the signaler will write
+- * a 1 before sending an interrupt. (This is needed because some interrupts
++ * a 1 before sending an interrupt.l (This is needed because some interrupts
+ * do not contain enough spare data bits to identify an event.)
+- * We get whole pages and map them to the process VA.
+- * Individual signal events use their event_id as slot index.
++ * We get whole pages from vmalloc and map them to the process VA.
++ * Individual signal events are then allocated a slot in a page.
+ */
+-struct kfd_signal_page {
++
++struct signal_page {
++ struct list_head event_pages; /* kfd_process.signal_event_pages */
+ uint64_t *kernel_address;
+- uint64_t handle;
+ uint64_t __user *user_address;
++ uint32_t page_index; /* Index into the mmap aperture. */
++ unsigned int free_slots;
++ unsigned long used_slot_bitmap[0];
+ };
+
++#define SLOTS_PER_PAGE KFD_SIGNAL_EVENT_LIMIT
++#define SLOT_BITMAP_SIZE BITS_TO_LONGS(SLOTS_PER_PAGE)
++#define BITS_PER_PAGE (ilog2(SLOTS_PER_PAGE)+1)
++#define SIGNAL_PAGE_SIZE (sizeof(struct signal_page) + \
++ SLOT_BITMAP_SIZE * sizeof(long))
+
+-static uint64_t *page_slots(struct kfd_signal_page *page)
++/*
++ * For signal events, the event ID is used as the interrupt user data.
++ * For SQ s_sendmsg interrupts, this is limited to 8 bits.
++ */
++
++#define INTERRUPT_DATA_BITS 8
++#define SIGNAL_EVENT_ID_SLOT_SHIFT 0
++
++static uint64_t *page_slots(struct signal_page *page)
+ {
+ return page->kernel_address;
+ }
+
+-static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
++static bool allocate_free_slot(struct kfd_process *process,
++ struct signal_page **out_page,
++ unsigned int *out_slot_index)
++{
++ struct signal_page *page;
++
++ list_for_each_entry(page, &process->signal_event_pages, event_pages) {
++ if (page->free_slots > 0) {
++ unsigned int slot =
++ find_first_zero_bit(page->used_slot_bitmap,
++ SLOTS_PER_PAGE);
++
++ __set_bit(slot, page->used_slot_bitmap);
++ page->free_slots--;
++
++ page_slots(page)[slot] = UNSIGNALED_EVENT_SLOT;
++
++ *out_page = page;
++ *out_slot_index = slot;
++
++ pr_debug("Allocated event signal slot in page %p, slot %d\n",
++ page, slot);
++
++ return true;
++ }
++ }
++
++ pr_debug("No free event signal slots were found for process %p\n",
++ process);
++
++ return false;
++}
++
++#define list_tail_entry(head, type, member) \
++ list_entry((head)->prev, type, member)
++
++static bool allocate_signal_page(struct file *devkfd, struct kfd_process *p)
+ {
+ void *backing_store;
+- struct kfd_signal_page *page;
++ struct signal_page *page;
+
+- page = kzalloc(sizeof(*page), GFP_KERNEL);
++ page = kzalloc(SIGNAL_PAGE_SIZE, GFP_KERNEL);
+ if (!page)
+- return NULL;
++ goto fail_alloc_signal_page;
++
++ page->free_slots = SLOTS_PER_PAGE;
+
+- backing_store = (void *) __get_free_pages(GFP_KERNEL,
++ backing_store = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
+ if (!backing_store)
+ goto fail_alloc_signal_store;
+
+- /* Initialize all events to unsignaled */
++ /* prevent user-mode info leaks */
+ memset(backing_store, (uint8_t) UNSIGNALED_EVENT_SLOT,
+- KFD_SIGNAL_EVENT_LIMIT * 8);
++ KFD_SIGNAL_EVENT_LIMIT * 8);
+
+ page->kernel_address = backing_store;
++
++ if (list_empty(&p->signal_event_pages))
++ page->page_index = 0;
++ else
++ page->page_index = list_tail_entry(&p->signal_event_pages,
++ struct signal_page,
++ event_pages)->page_index + 1;
++
+ pr_debug("Allocated new event signal page at %p, for process %p\n",
+ page, p);
++ pr_debug("Page index is %d\n", page->page_index);
++
++ list_add(&page->event_pages, &p->signal_event_pages);
+
+- return page;
++ return true;
+
+ fail_alloc_signal_store:
+ kfree(page);
+- return NULL;
++fail_alloc_signal_page:
++ return false;
+ }
+
+-static int allocate_event_notification_slot(struct kfd_process *p,
+- struct kfd_event *ev)
++static bool allocate_event_notification_slot(struct file *devkfd,
++ struct kfd_process *p,
++ struct signal_page **page,
++ unsigned int *signal_slot_index)
+ {
+- int id;
++ bool ret;
+
+- if (!p->signal_page) {
+- p->signal_page = allocate_signal_page(p);
+- if (!p->signal_page)
+- return -ENOMEM;
++ ret = allocate_free_slot(p, page, signal_slot_index);
++ if (!ret) {
++ ret = allocate_signal_page(devkfd, p);
++ if (ret)
++ ret = allocate_free_slot(p, page, signal_slot_index);
+ }
+
+- id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT,
+- GFP_KERNEL);
+- if (id < 0)
+- return id;
+-
+- ev->event_id = id;
+- page_slots(p->signal_page)[id] = UNSIGNALED_EVENT_SLOT;
+-
+- return 0;
++ return ret;
+ }
+
+-static struct kfd_signal_page *allocate_signal_page_dgpu(
+- struct kfd_process *p, uint64_t *kernel_address, uint64_t handle)
++/* Assumes that the process's event_mutex is locked. */
++static void release_event_notification_slot(struct signal_page *page,
++ size_t slot_index)
+ {
+- struct kfd_signal_page *my_page;
++ __clear_bit(slot_index, page->used_slot_bitmap);
++ page->free_slots++;
+
+- my_page = kzalloc(sizeof(*my_page), GFP_KERNEL);
+- if (!my_page)
+- return NULL;
+-
+- /* Initialize all events to unsignaled */
+- memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
+- KFD_SIGNAL_EVENT_LIMIT * 8);
++ /* We don't free signal pages, they are retained by the process
++ * and reused until it exits.
++ */
++}
+
+- my_page->kernel_address = kernel_address;
+- my_page->handle = handle;
+- my_page->user_address = NULL;
++static struct signal_page *lookup_signal_page_by_index(struct kfd_process *p,
++ unsigned int page_index)
++{
++ struct signal_page *page;
+
+- pr_debug("Allocated new event signal page at %p, for process %p\n",
+- my_page, p);
++ /*
++ * This is safe because we don't delete signal pages until the
++ * process exits.
++ */
++ list_for_each_entry(page, &p->signal_event_pages, event_pages)
++ if (page->page_index == page_index)
++ return page;
+
+- return my_page;
++ return NULL;
+ }
+
+ /*
+@@ -140,80 +220,96 @@ static struct kfd_signal_page *allocate_signal_page_dgpu(
+ */
+ static struct kfd_event *lookup_event_by_id(struct kfd_process *p, uint32_t id)
+ {
+- return idr_find(&p->event_idr, id);
++ struct kfd_event *ev;
++
++ hash_for_each_possible(p->events, ev, events, id)
++ if (ev->event_id == id)
++ return ev;
++
++ return NULL;
+ }
+
+-/**
+- * lookup_signaled_event_by_partial_id - Lookup signaled event from partial ID
+- * @p: Pointer to struct kfd_process
+- * @id: ID to look up
+- * @bits: Number of valid bits in @id
+- *
+- * Finds the first signaled event with a matching partial ID. If no
+- * matching signaled event is found, returns NULL. In that case the
+- * caller should assume that the partial ID is invalid and do an
+- * exhaustive search of all siglaned events.
+- *
+- * If multiple events with the same partial ID signal at the same
+- * time, they will be found one interrupt at a time, not necessarily
+- * in the same order the interrupts occurred. As long as the number of
+- * interrupts is correct, all signaled events will be seen by the
+- * driver.
++static u32 make_signal_event_id(struct signal_page *page,
++ unsigned int signal_slot_index)
++{
++ return page->page_index |
++ (signal_slot_index << SIGNAL_EVENT_ID_SLOT_SHIFT);
++}
++
++/*
++ * Produce a kfd event id for a nonsignal event.
++ * These are arbitrary numbers, so we do a sequential search through
++ * the hash table for an unused number.
+ */
+-static struct kfd_event *lookup_signaled_event_by_partial_id(
+- struct kfd_process *p, uint32_t id, uint32_t bits)
++static u32 make_nonsignal_event_id(struct kfd_process *p)
+ {
+- struct kfd_event *ev;
++ u32 id;
+
+- if (!p->signal_page || id >= KFD_SIGNAL_EVENT_LIMIT)
+- return NULL;
++ for (id = p->next_nonsignal_event_id;
++ id < KFD_LAST_NONSIGNAL_EVENT_ID &&
++ lookup_event_by_id(p, id);
++ id++)
++ ;
+
+- /* Fast path for the common case that @id is not a partial ID
+- * and we only need a single lookup.
+- */
+- if (bits > 31 || (1U << bits) >= KFD_SIGNAL_EVENT_LIMIT) {
+- if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
+- return NULL;
++ if (id < KFD_LAST_NONSIGNAL_EVENT_ID) {
++
++ /*
++ * What if id == LAST_NONSIGNAL_EVENT_ID - 1?
++ * Then next_nonsignal_event_id = LAST_NONSIGNAL_EVENT_ID so
++ * the first loop fails immediately and we proceed with the
++ * wraparound loop below.
++ */
++ p->next_nonsignal_event_id = id + 1;
+
+- return idr_find(&p->event_idr, id);
++ return id;
+ }
+
+- /* General case for partial IDs: Iterate over all matching IDs
+- * and find the first one that has signaled.
+- */
+- for (ev = NULL; id < KFD_SIGNAL_EVENT_LIMIT && !ev; id += 1U << bits) {
+- if (page_slots(p->signal_page)[id] == UNSIGNALED_EVENT_SLOT)
+- continue;
++ for (id = KFD_FIRST_NONSIGNAL_EVENT_ID;
++ id < KFD_LAST_NONSIGNAL_EVENT_ID &&
++ lookup_event_by_id(p, id);
++ id++)
++ ;
++
+
+- ev = idr_find(&p->event_idr, id);
++ if (id < KFD_LAST_NONSIGNAL_EVENT_ID) {
++ p->next_nonsignal_event_id = id + 1;
++ return id;
+ }
+
+- return ev;
++ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID;
++ return 0;
++}
++
++static struct kfd_event *lookup_event_by_page_slot(struct kfd_process *p,
++ struct signal_page *page,
++ unsigned int signal_slot)
++{
++ return lookup_event_by_id(p, make_signal_event_id(page, signal_slot));
+ }
+
+ static int create_signal_event(struct file *devkfd,
+ struct kfd_process *p,
+ struct kfd_event *ev)
+ {
+- int ret;
+-
+ if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) {
+- if (!p->signal_event_limit_reached) {
+- pr_warn("Signal event wasn't created because limit was reached\n");
+- p->signal_event_limit_reached = true;
+- }
+- return -ENOSPC;
++ pr_warn("Signal event wasn't created because limit was reached\n");
++ return -ENOMEM;
+ }
+
+- ret = allocate_event_notification_slot(p, ev);
+- if (ret) {
++ if (!allocate_event_notification_slot(devkfd, p, &ev->signal_page,
++ &ev->signal_slot_index)) {
+ pr_warn("Signal event wasn't created because out of kernel memory\n");
+- return ret;
++ return -ENOMEM;
+ }
+
+ p->signal_event_count++;
+
+- ev->user_signal_address = &p->signal_page->user_address[ev->event_id];
++ ev->user_signal_address =
++ &ev->signal_page->user_address[ev->signal_slot_index];
++
++ ev->event_id = make_signal_event_id(ev->signal_page,
++ ev->signal_slot_index);
++
+ pr_debug("Signal event number %zu created with id %d, address %p\n",
+ p->signal_event_count, ev->event_id,
+ ev->user_signal_address);
+@@ -221,20 +317,16 @@ static int create_signal_event(struct file *devkfd,
+ return 0;
+ }
+
++/*
++ * No non-signal events are supported yet.
++ * We create them as events that never signal.
++ * Set event calls from user-mode are failed.
++ */
+ static int create_other_event(struct kfd_process *p, struct kfd_event *ev)
+ {
+- /* Cast KFD_LAST_NONSIGNAL_EVENT to uint32_t. This allows an
+- * intentional integer overflow to -1 without a compiler
+- * warning. idr_alloc treats a negative value as "maximum
+- * signed integer".
+- */
+- int id = idr_alloc(&p->event_idr, ev, KFD_FIRST_NONSIGNAL_EVENT_ID,
+- (uint32_t)KFD_LAST_NONSIGNAL_EVENT_ID + 1,
+- GFP_KERNEL);
+-
+- if (id < 0)
+- return id;
+- ev->event_id = id;
++ ev->event_id = make_nonsignal_event_id(p);
++ if (ev->event_id == 0)
++ return -ENOMEM;
+
+ return 0;
+ }
+@@ -242,50 +334,52 @@ static int create_other_event(struct kfd_process *p, struct kfd_event *ev)
+ void kfd_event_init_process(struct kfd_process *p)
+ {
+ mutex_init(&p->event_mutex);
+- idr_init(&p->event_idr);
+- p->signal_page = NULL;
++ hash_init(p->events);
++ INIT_LIST_HEAD(&p->signal_event_pages);
++ p->next_nonsignal_event_id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+ p->signal_event_count = 0;
+ }
+
+ static void destroy_event(struct kfd_process *p, struct kfd_event *ev)
+ {
+- struct kfd_event_waiter *waiter;
+-
+- /* Wake up pending waiters. They will return failure */
+- list_for_each_entry(waiter, &ev->wq.head, wait.entry)
+- waiter->event = NULL;
+- wake_up_all(&ev->wq);
+-
+- if (ev->type == KFD_EVENT_TYPE_SIGNAL ||
+- ev->type == KFD_EVENT_TYPE_DEBUG)
++ if (ev->signal_page) {
++ release_event_notification_slot(ev->signal_page,
++ ev->signal_slot_index);
+ p->signal_event_count--;
++ }
++
++ /*
++ * Abandon the list of waiters. Individual waiting threads will
++ * clean up their own data.
++ */
++ list_del(&ev->waiters);
+
+- idr_remove(&p->event_idr, ev->event_id);
++ hash_del(&ev->events);
+ kfree(ev);
+ }
+
+ static void destroy_events(struct kfd_process *p)
+ {
+ struct kfd_event *ev;
+- uint32_t id;
++ struct hlist_node *tmp;
++ unsigned int hash_bkt;
+
+- idr_for_each_entry(&p->event_idr, ev, id)
++ hash_for_each_safe(p->events, hash_bkt, tmp, ev, events)
+ destroy_event(p, ev);
+- idr_destroy(&p->event_idr);
+ }
+
+ /*
+ * We assume that the process is being destroyed and there is no need to
+ * unmap the pages or keep bookkeeping data in order.
+ */
+-static void shutdown_signal_page(struct kfd_process *p)
++static void shutdown_signal_pages(struct kfd_process *p)
+ {
+- struct kfd_signal_page *page = p->signal_page;
++ struct signal_page *page, *tmp;
+
+- if (page) {
+- if (page->user_address)
+- free_pages((unsigned long)page->kernel_address,
+- get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
++ list_for_each_entry_safe(page, tmp, &p->signal_event_pages,
++ event_pages) {
++ free_pages((unsigned long)page->kernel_address,
++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
+ kfree(page);
+ }
+ }
+@@ -293,7 +387,7 @@ static void shutdown_signal_page(struct kfd_process *p)
+ void kfd_event_free_process(struct kfd_process *p)
+ {
+ destroy_events(p);
+- shutdown_signal_page(p);
++ shutdown_signal_pages(p);
+ }
+
+ static bool event_can_be_gpu_signaled(const struct kfd_event *ev)
+@@ -310,8 +404,7 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
+ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ uint32_t event_type, bool auto_reset, uint32_t node_id,
+ uint32_t *event_id, uint32_t *event_trigger_data,
+- uint64_t *event_page_offset, uint32_t *event_slot_index,
+- void *kern_addr)
++ uint64_t *event_page_offset, uint32_t *event_slot_index)
+ {
+ int ret = 0;
+ struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+@@ -323,29 +416,21 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ ev->auto_reset = auto_reset;
+ ev->signaled = false;
+
+- init_waitqueue_head(&ev->wq);
+-
+- mutex_lock(&p->event_mutex);
+-
+- if (kern_addr && !p->signal_page) {
+- p->signal_page = allocate_signal_page_dgpu(p, kern_addr,
+- *event_page_offset);
+- if (!p->signal_page) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- }
++ INIT_LIST_HEAD(&ev->waiters);
+
+ *event_page_offset = 0;
+
++ mutex_lock(&p->event_mutex);
++
+ switch (event_type) {
+ case KFD_EVENT_TYPE_SIGNAL:
+ case KFD_EVENT_TYPE_DEBUG:
+ ret = create_signal_event(devkfd, p, ev);
+ if (!ret) {
+- *event_page_offset = KFD_MMAP_TYPE_EVENTS;
++ *event_page_offset = (ev->signal_page->page_index |
++ KFD_MMAP_EVENTS_MASK);
+ *event_page_offset <<= PAGE_SHIFT;
+- *event_slot_index = ev->event_id;
++ *event_slot_index = ev->signal_slot_index;
+ }
+ break;
+ default:
+@@ -354,13 +439,14 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ }
+
+ if (!ret) {
++ hash_add(p->events, &ev->events, ev->event_id);
++
+ *event_id = ev->event_id;
+ *event_trigger_data = ev->event_id;
+ } else {
+ kfree(ev);
+ }
+
+-out:
+ mutex_unlock(&p->event_mutex);
+
+ return ret;
+@@ -388,14 +474,19 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id)
+ static void set_event(struct kfd_event *ev)
+ {
+ struct kfd_event_waiter *waiter;
++ struct kfd_event_waiter *next;
+
+ /* Auto reset if the list is non-empty and we're waking someone. */
+- ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq);
++ ev->signaled = !ev->auto_reset || list_empty(&ev->waiters);
+
+- list_for_each_entry(waiter, &ev->wq.head, wait.entry)
++ list_for_each_entry_safe(waiter, next, &ev->waiters, waiters) {
+ waiter->activated = true;
+
+- wake_up_all(&ev->wq);
++ /* _init because free_waiters will call list_del */
++ list_del_init(&waiter->waiters);
++
++ wake_up_process(waiter->sleeping_task);
++ }
+ }
+
+ /* Assumes that p is current. */
+@@ -444,7 +535,13 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id)
+
+ static void acknowledge_signal(struct kfd_process *p, struct kfd_event *ev)
+ {
+- page_slots(p->signal_page)[ev->event_id] = UNSIGNALED_EVENT_SLOT;
++ page_slots(ev->signal_page)[ev->signal_slot_index] =
++ UNSIGNALED_EVENT_SLOT;
++}
++
++static bool is_slot_signaled(struct signal_page *page, unsigned int index)
++{
++ return page_slots(page)[index] != UNSIGNALED_EVENT_SLOT;
+ }
+
+ static void set_event_from_interrupt(struct kfd_process *p,
+@@ -459,12 +556,12 @@ static void set_event_from_interrupt(struct kfd_process *p,
+ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
+ uint32_t valid_id_bits)
+ {
+- struct kfd_event *ev = NULL;
++ struct kfd_event *ev;
+
+ /*
+ * Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+- * running so the lookup function increments the process ref count.
++ * running so the lookup function returns a locked process.
+ */
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+@@ -473,50 +570,30 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
+
+ mutex_lock(&p->event_mutex);
+
+- if (valid_id_bits)
+- ev = lookup_signaled_event_by_partial_id(p, partial_id,
+- valid_id_bits);
+- if (ev) {
++ if (valid_id_bits >= INTERRUPT_DATA_BITS) {
++ /* Partial ID is a full ID. */
++ ev = lookup_event_by_id(p, partial_id);
+ set_event_from_interrupt(p, ev);
+- } else if (p->signal_page) {
++ } else {
+ /*
+- * Partial ID lookup failed. Assume that the event ID
+- * in the interrupt payload was invalid and do an
+- * exhaustive search of signaled events.
++ * Partial ID is in fact partial. For now we completely
++ * ignore it, but we could use any bits we did receive to
++ * search faster.
+ */
+- uint64_t *slots = page_slots(p->signal_page);
+- uint32_t id;
+-
+- if (valid_id_bits)
+- pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
+- partial_id, valid_id_bits);
+-
+- if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/64) {
+- /* With relatively few events, it's faster to
+- * iterate over the event IDR
+- */
+- idr_for_each_entry(&p->event_idr, ev, id) {
+- if (id >= KFD_SIGNAL_EVENT_LIMIT)
+- break;
+-
+- if (slots[id] != UNSIGNALED_EVENT_SLOT)
+- set_event_from_interrupt(p, ev);
+- }
+- } else {
+- /* With relatively many events, it's faster to
+- * iterate over the signal slots and lookup
+- * only signaled events from the IDR.
+- */
+- for (id = 0; id < KFD_SIGNAL_EVENT_LIMIT; id++)
+- if (slots[id] != UNSIGNALED_EVENT_SLOT) {
+- ev = lookup_event_by_id(p, id);
++ struct signal_page *page;
++ unsigned int i;
++
++ list_for_each_entry(page, &p->signal_event_pages, event_pages)
++ for (i = 0; i < SLOTS_PER_PAGE; i++)
++ if (is_slot_signaled(page, i)) {
++ ev = lookup_event_by_page_slot(p,
++ page, i);
+ set_event_from_interrupt(p, ev);
+ }
+- }
+ }
+
+ mutex_unlock(&p->event_mutex);
+- kfd_unref_process(p);
++ mutex_unlock(&p->mutex);
+ }
+
+ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
+@@ -529,16 +606,18 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
+ GFP_KERNEL);
+
+ for (i = 0; (event_waiters) && (i < num_events) ; i++) {
+- init_wait(&event_waiters[i].wait);
++ INIT_LIST_HEAD(&event_waiters[i].waiters);
++ event_waiters[i].sleeping_task = current;
+ event_waiters[i].activated = false;
+ }
+
+ return event_waiters;
+ }
+
+-static int init_event_waiter_get_status(struct kfd_process *p,
++static int init_event_waiter(struct kfd_process *p,
+ struct kfd_event_waiter *waiter,
+- uint32_t event_id)
++ uint32_t event_id,
++ uint32_t input_index)
+ {
+ struct kfd_event *ev = lookup_event_by_id(p, event_id);
+
+@@ -546,60 +625,38 @@ static int init_event_waiter_get_status(struct kfd_process *p,
+ return -EINVAL;
+
+ waiter->event = ev;
++ waiter->input_index = input_index;
+ waiter->activated = ev->signaled;
+ ev->signaled = ev->signaled && !ev->auto_reset;
+
+- return 0;
+-}
++ list_add(&waiter->waiters, &ev->waiters);
+
+-static void init_event_waiter_add_to_waitlist(struct kfd_event_waiter *waiter)
+-{
+- struct kfd_event *ev = waiter->event;
+-
+- /* Only add to the wait list if we actually need to
+- * wait on this event.
+- */
+- if (!waiter->activated)
+- add_wait_queue(&ev->wq, &waiter->wait);
++ return 0;
+ }
+
+-/* test_event_condition - Test condition of events being waited for
+- * @all: Return completion only if all events have signaled
+- * @num_events: Number of events to wait for
+- * @event_waiters: Array of event waiters, one per event
+- *
+- * Returns KFD_IOC_WAIT_RESULT_COMPLETE if all (or one) event(s) have
+- * signaled. Returns KFD_IOC_WAIT_RESULT_TIMEOUT if no (or not all)
+- * events have signaled. Returns KFD_IOC_WAIT_RESULT_FAIL if any of
+- * the events have been destroyed.
+- */
+-static uint32_t test_event_condition(bool all, uint32_t num_events,
++static bool test_event_condition(bool all, uint32_t num_events,
+ struct kfd_event_waiter *event_waiters)
+ {
+ uint32_t i;
+ uint32_t activated_count = 0;
+
+ for (i = 0; i < num_events; i++) {
+- if (!event_waiters[i].event)
+- return KFD_IOC_WAIT_RESULT_FAIL;
+-
+ if (event_waiters[i].activated) {
+ if (!all)
+- return KFD_IOC_WAIT_RESULT_COMPLETE;
++ return true;
+
+ activated_count++;
+ }
+ }
+
+- return activated_count == num_events ?
+- KFD_IOC_WAIT_RESULT_COMPLETE : KFD_IOC_WAIT_RESULT_TIMEOUT;
++ return activated_count == num_events;
+ }
+
+ /*
+ * Copy event specific data, if defined.
+ * Currently only memory exception events have additional data to copy to user
+ */
+-static int copy_signaled_event_data(uint32_t num_events,
++static bool copy_signaled_event_data(uint32_t num_events,
+ struct kfd_event_waiter *event_waiters,
+ struct kfd_event_data __user *data)
+ {
+@@ -613,15 +670,15 @@ static int copy_signaled_event_data(uint32_t num_events,
+ waiter = &event_waiters[i];
+ event = waiter->event;
+ if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
+- dst = &data[i].memory_exception_data;
++ dst = &data[waiter->input_index].memory_exception_data;
+ src = &event->memory_exception_data;
+ if (copy_to_user(dst, src,
+ sizeof(struct kfd_hsa_memory_exception_data)))
+- return -EFAULT;
++ return false;
+ }
+ }
+
+- return 0;
++ return true;
+
+ }
+
+@@ -650,9 +707,7 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
+ uint32_t i;
+
+ for (i = 0; i < num_events; i++)
+- if (waiters[i].event)
+- remove_wait_queue(&waiters[i].event->wq,
+- &waiters[i].wait);
++ list_del(&waiters[i].waiters);
+
+ kfree(waiters);
+ }
+@@ -660,56 +715,38 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
+ int kfd_wait_on_events(struct kfd_process *p,
+ uint32_t num_events, void __user *data,
+ bool all, uint32_t user_timeout_ms,
+- uint32_t *wait_result)
++ enum kfd_event_wait_result *wait_result)
+ {
+ struct kfd_event_data __user *events =
+ (struct kfd_event_data __user *) data;
+ uint32_t i;
+ int ret = 0;
+-
+ struct kfd_event_waiter *event_waiters = NULL;
+ long timeout = user_timeout_to_jiffies(user_timeout_ms);
+
++ mutex_lock(&p->event_mutex);
++
+ event_waiters = alloc_event_waiters(num_events);
+ if (!event_waiters) {
+ ret = -ENOMEM;
+- goto out;
++ goto fail;
+ }
+
+- mutex_lock(&p->event_mutex);
+-
+ for (i = 0; i < num_events; i++) {
+ struct kfd_event_data event_data;
+
+ if (copy_from_user(&event_data, &events[i],
+ sizeof(struct kfd_event_data))) {
+ ret = -EFAULT;
+- goto out_unlock;
++ goto fail;
+ }
+
+- ret = init_event_waiter_get_status(p, &event_waiters[i],
+- event_data.event_id);
++ ret = init_event_waiter(p, &event_waiters[i],
++ event_data.event_id, i);
+ if (ret)
+- goto out_unlock;
+- }
+-
+- /* Check condition once. */
+- *wait_result = test_event_condition(all, num_events, event_waiters);
+- if (*wait_result == KFD_IOC_WAIT_RESULT_COMPLETE) {
+- ret = copy_signaled_event_data(num_events,
+- event_waiters, events);
+- goto out_unlock;
+- } else if (WARN_ON(*wait_result == KFD_IOC_WAIT_RESULT_FAIL)) {
+- /* This should not happen. Events shouldn't be
+- * destroyed while we're holding the event_mutex
+- */
+- goto out_unlock;
++ goto fail;
+ }
+
+- /* Add to wait lists if we need to wait. */
+- for (i = 0; i < num_events; i++)
+- init_event_waiter_add_to_waitlist(&event_waiters[i]);
+-
+ mutex_unlock(&p->event_mutex);
+
+ while (true) {
+@@ -731,45 +768,37 @@ int kfd_wait_on_events(struct kfd_process *p,
+ break;
+ }
+
+- /* Set task state to interruptible sleep before
+- * checking wake-up conditions. A concurrent wake-up
+- * will put the task back into runnable state. In that
+- * case schedule_timeout will not put the task to
+- * sleep and we'll get a chance to re-check the
+- * updated conditions almost immediately. Otherwise,
+- * this race condition would lead to a soft hang or a
+- * very long sleep.
+- */
+- set_current_state(TASK_INTERRUPTIBLE);
+-
+- *wait_result = test_event_condition(all, num_events,
+- event_waiters);
+- if (*wait_result != KFD_IOC_WAIT_RESULT_TIMEOUT)
++ if (test_event_condition(all, num_events, event_waiters)) {
++ if (copy_signaled_event_data(num_events,
++ event_waiters, events))
++ *wait_result = KFD_WAIT_COMPLETE;
++ else
++ *wait_result = KFD_WAIT_ERROR;
+ break;
++ }
+
+- if (timeout <= 0)
++ if (timeout <= 0) {
++ *wait_result = KFD_WAIT_TIMEOUT;
+ break;
++ }
+
+- timeout = schedule_timeout(timeout);
++ timeout = schedule_timeout_interruptible(timeout);
+ }
+ __set_current_state(TASK_RUNNING);
+
+- /* copy_signaled_event_data may sleep. So this has to happen
+- * after the task state is set back to RUNNING.
+- */
+- if (!ret && *wait_result == KFD_IOC_WAIT_RESULT_COMPLETE)
+- ret = copy_signaled_event_data(num_events,
+- event_waiters, events);
+-
+ mutex_lock(&p->event_mutex);
+-out_unlock:
+ free_waiters(num_events, event_waiters);
+ mutex_unlock(&p->event_mutex);
+-out:
+- if (ret)
+- *wait_result = KFD_IOC_WAIT_RESULT_FAIL;
+- else if (*wait_result == KFD_IOC_WAIT_RESULT_FAIL)
+- ret = -EIO;
++
++ return ret;
++
++fail:
++ if (event_waiters)
++ free_waiters(num_events, event_waiters);
++
++ mutex_unlock(&p->event_mutex);
++
++ *wait_result = KFD_WAIT_ERROR;
+
+ return ret;
+ }
+@@ -777,8 +806,9 @@ int kfd_wait_on_events(struct kfd_process *p,
+ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
+ {
+
++ unsigned int page_index;
+ unsigned long pfn;
+- struct kfd_signal_page *page;
++ struct signal_page *page;
+
+ /* check required size is logical */
+ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) !=
+@@ -787,10 +817,13 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
+ return -EINVAL;
+ }
+
+- page = p->signal_page;
++ page_index = vma->vm_pgoff;
++
++ page = lookup_signal_page_by_index(p, page_index);
+ if (!page) {
+ /* Probably KFD bug, but mmap is user-accessible. */
+- pr_debug("Signal page could not be found\n");
++ pr_debug("Signal page could not be found for page_index %u\n",
++ page_index);
+ return -EINVAL;
+ }
+
+@@ -824,13 +857,12 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
+ {
+ struct kfd_hsa_memory_exception_data *ev_data;
+ struct kfd_event *ev;
+- uint32_t id;
++ int bkt;
+ bool send_signal = true;
+
+ ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+- id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+- idr_for_each_entry_continue(&p->event_idr, ev, id)
++ hash_for_each(p->events, bkt, ev, events)
+ if (ev->type == type) {
+ send_signal = false;
+ dev_dbg(kfd_device,
+@@ -841,13 +873,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
+ ev->memory_exception_data = *ev_data;
+ }
+
+- if (type == KFD_EVENT_TYPE_MEMORY) {
+- dev_warn(kfd_device,
+- "Sending SIGSEGV to HSA Process with PID %d ",
+- p->lead_thread->pid);
+- send_sig(SIGSEGV, p->lead_thread, 0);
+- }
+-
+ /* Send SIGTERM if no event of type "type" has been found */
+ if (send_signal) {
+ if (send_sigterm) {
+@@ -863,7 +888,6 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
+ }
+ }
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+ unsigned long address, bool is_write_requested,
+ bool is_execute_requested)
+@@ -874,27 +898,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+ /*
+ * Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+- * running so the lookup function increments the process ref count.
++ * running so the lookup function returns a locked process.
+ */
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+- struct mm_struct *mm;
+
+ if (!p)
+ return; /* Presumably process exited. */
+
+- /* Take a safe reference to the mm_struct, which may otherwise
+- * disappear even while the kfd_process is still referenced.
+- */
+- mm = get_task_mm(p->lead_thread);
+- if (!mm) {
+- kfd_unref_process(p);
+- return; /* Process is exiting */
+- }
+-
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+
+- down_read(&mm->mmap_sem);
+- vma = find_vma(mm, address);
++ down_read(&p->mm->mmap_sem);
++ vma = find_vma(p->mm, address);
+
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.va = address;
+@@ -920,8 +934,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+ }
+ }
+
+- up_read(&mm->mmap_sem);
+- mmput(mm);
++ up_read(&p->mm->mmap_sem);
+
+ mutex_lock(&p->event_mutex);
+
+@@ -930,17 +943,15 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+ &memory_exception_data);
+
+ mutex_unlock(&p->event_mutex);
+-
+- kfd_unref_process(p);
++ mutex_unlock(&p->mutex);
+ }
+-#endif /* CONFIG_AMD_IOMMU_V2_MODULE */
+
+ void kfd_signal_hw_exception_event(unsigned int pasid)
+ {
+ /*
+ * Because we are called from arbitrary context (workqueue) as opposed
+ * to process context, kfd_process could attempt to exit while we are
+- * running so the lookup function increments the process ref count.
++ * running so the lookup function returns a locked process.
+ */
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+@@ -953,42 +964,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
+ lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL);
+
+ mutex_unlock(&p->event_mutex);
+- kfd_unref_process(p);
+-}
+-
+-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+- struct kfd_vm_fault_info *info)
+-{
+- struct kfd_event *ev;
+- uint32_t id;
+- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+- struct kfd_hsa_memory_exception_data memory_exception_data;
+-
+- if (!p)
+- return; /* Presumably process exited. */
+- memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+- memory_exception_data.gpu_id = dev->id;
+- memory_exception_data.failure.imprecise = true;
+- /* Set failure reason */
+- if (info) {
+- memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
+- memory_exception_data.failure.NotPresent =
+- info->prot_valid ? 1 : 0;
+- memory_exception_data.failure.NoExecute =
+- info->prot_exec ? 1 : 0;
+- memory_exception_data.failure.ReadOnly =
+- info->prot_write ? 1 : 0;
+- memory_exception_data.failure.imprecise = 0;
+- }
+- mutex_lock(&p->event_mutex);
+-
+- id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+- idr_for_each_entry_continue(&p->event_idr, ev, id)
+- if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+- ev->memory_exception_data = memory_exception_data;
+- set_event(ev);
+- }
+-
+- mutex_unlock(&p->event_mutex);
+- kfd_unref_process(p);
++ mutex_unlock(&p->mutex);
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+index abca5bf..28f6838 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h
+@@ -27,17 +27,12 @@
+ #include <linux/hashtable.h>
+ #include <linux/types.h>
+ #include <linux/list.h>
+-#include <linux/wait.h>
+ #include "kfd_priv.h"
+ #include <uapi/linux/kfd_ioctl.h>
+
+-/*
+- * IDR supports non-negative integer IDs. Small IDs are used for
+- * signal events to match their signal slot. Use the upper half of the
+- * ID space for non-signal events.
+- */
+-#define KFD_FIRST_NONSIGNAL_EVENT_ID ((INT_MAX >> 1) + 1)
+-#define KFD_LAST_NONSIGNAL_EVENT_ID INT_MAX
++#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
++#define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
++#define KFD_LAST_NONSIGNAL_EVENT_ID UINT_MAX
+
+ /*
+ * Written into kfd_signal_slot_t to indicate that the event is not signaled.
+@@ -51,6 +46,9 @@ struct kfd_event_waiter;
+ struct signal_page;
+
+ struct kfd_event {
++ /* All events in process, rooted at kfd_process.events. */
++ struct hlist_node events;
++
+ u32 event_id;
+
+ bool signaled;
+@@ -58,9 +56,11 @@ struct kfd_event {
+
+ int type;
+
+- wait_queue_head_t wq; /* List of event waiters. */
++ struct list_head waiters; /* List of kfd_event_waiter, linked by their waiters field. */
+
+ /* Only for signal events. */
++ struct signal_page *signal_page;
++ unsigned int signal_slot_index;
+ uint64_t __user *user_signal_address;
+
+ /* type specific data */
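
The defines above split the 32-bit event ID space on the top bit: IDs at or above KFD_EVENT_ID_NONSIGNAL_MASK belong to non-signal events, everything below it to signal events. A stand-alone user-space check of that split (the constant is copied from the header; the example IDs are arbitrary):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U

int main(void)
{
	uint32_t signal_id = 42;	/* any ID below the mask is in the signal range */
	uint32_t nonsignal_id = KFD_EVENT_ID_NONSIGNAL_MASK + 1;

	assert(!(signal_id & KFD_EVENT_ID_NONSIGNAL_MASK));
	assert(nonsignal_id & KFD_EVENT_ID_NONSIGNAL_MASK);
	printf("signal 0x%08x, non-signal 0x%08x\n", signal_id, nonsignal_id);
	return 0;
}
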
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+index 499efa1..c59384b 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+@@ -275,80 +275,24 @@
+ * for FLAT_* / S_LOAD operations.
+ */
+
+-#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \
++#define MAKE_GPUVM_APP_BASE(gpu_num) \
+ (((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
+
+-#define MAKE_GPUVM_APP_LIMIT(base, size) \
+- (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
++#define MAKE_GPUVM_APP_LIMIT(base) \
++ (((uint64_t)(base) & \
++ 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)
+
+-#define MAKE_SCRATCH_APP_BASE_VI() \
+- (((uint64_t)(0x1UL) << 61) + 0x100000000L)
++#define MAKE_SCRATCH_APP_BASE(gpu_num) \
++ (((uint64_t)(gpu_num) << 61) + 0x100000000L)
+
+ #define MAKE_SCRATCH_APP_LIMIT(base) \
+ (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+
+-#define MAKE_LDS_APP_BASE_VI() \
+- (((uint64_t)(0x1UL) << 61) + 0x0)
+-
++#define MAKE_LDS_APP_BASE(gpu_num) \
++ (((uint64_t)(gpu_num) << 61) + 0x0)
+ #define MAKE_LDS_APP_LIMIT(base) \
+ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+
+-/* On GFXv9 the LDS and scratch apertures are programmed independently
+- * using the high 16 bits of the 64-bit virtual address. They must be
+- * in the hole, which will be the case as long as the high 16 bits are
+- * not 0.
+- *
+- * The aperture sizes are still 4GB implicitly.
+- *
+- * A GPUVM aperture is not applicable on GFXv9.
+- */
+-#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48)
+-#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48)
+-
+-/* Some VM address space reserved for kernel use (CWSR trap handlers
+- * and kernel IBs)
+- */
+-#define DGPU_VM_BASE_DEFAULT 0x100000
+-#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
+-
+-int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+- uint64_t base, uint64_t limit)
+-{
+- if (base < (pdd->qpd.cwsr_base + KFD_CWSR_TBA_TMA_SIZE)) {
+- pr_err("Set dgpu vm base 0x%llx failed.\n", base);
+- return -EINVAL;
+- }
+- pdd->dgpu_base = base;
+- pdd->dgpu_limit = limit;
+- return 0;
+-}
+-
+-void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id)
+-{
+- /*
+- * node id couldn't be 0 - the three MSB bits of
+- * aperture shoudn't be 0
+- */
+- pdd->lds_base = MAKE_LDS_APP_BASE_VI();
+- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+-
+- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1);
+- pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
+- pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size);
+-
+- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI();
+- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+-}
+-
+-void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id)
+-{
+- pdd->lds_base = MAKE_LDS_APP_BASE_V9();
+- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
+-
+- pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9();
+- pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+-}
+-
+ int kfd_init_apertures(struct kfd_process *process)
+ {
+ uint8_t id = 0;
+@@ -356,11 +300,8 @@ int kfd_init_apertures(struct kfd_process *process)
+ struct kfd_process_device *pdd;
+
+ /*Iterating over all devices*/
+- while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
+- if (!dev) {
+- id++; /* Skip non GPU devices */
+- continue;
+- }
++ while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
++ id < NUM_OF_SUPPORTED_GPUS) {
+
+ pdd = kfd_create_process_device_data(dev, process);
+ if (!pdd) {
+@@ -377,29 +318,23 @@ int kfd_init_apertures(struct kfd_process *process)
+ pdd->gpuvm_base = pdd->gpuvm_limit = 0;
+ pdd->scratch_base = pdd->scratch_limit = 0;
+ } else {
+- switch (dev->device_info->asic_family) {
+- case CHIP_KAVERI:
+- case CHIP_HAWAII:
+- case CHIP_CARRIZO:
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+- kfd_init_apertures_vi(pdd, id);
+- break;
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- kfd_init_apertures_v9(pdd, id);
+- break;
+- default:
+- pr_err("Unknown chip in kfd_init_apertures\n");
+- return -1;
+- }
++ /*
++ * node id couldn't be 0 - the three MSB bits of
++ * aperture shoudn't be 0
++ */
++ pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
++
++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
++
++ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
++
++ pdd->gpuvm_limit =
++ MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
+
+- if (!dev->device_info->is_need_iommu_device) {
+- pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
+- pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT;
+- }
++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
++
++ pdd->scratch_limit =
++ MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+ }
+
+ dev_dbg(kfd_device, "node id %u\n", id);
+@@ -416,3 +351,5 @@ int kfd_init_apertures(struct kfd_process *process)
+
+ return 0;
+ }
++
++
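
The aperture macros restored above pack the node number into the top three bits of the 64-bit virtual address and derive each limit by masking the base and filling in the low bits. A user-space check of that arithmetic for node id 1 (the macros are copied from the hunk; the expected constants follow directly from them):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAKE_GPUVM_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
#define MAKE_GPUVM_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)
#define MAKE_SCRATCH_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x100000000L)
#define MAKE_SCRATCH_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
#define MAKE_LDS_APP_BASE(gpu_num) \
	(((uint64_t)(gpu_num) << 61) + 0x0)
#define MAKE_LDS_APP_LIMIT(base) \
	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)

int main(void)
{
	uint64_t lds = MAKE_LDS_APP_BASE(1);
	uint64_t gpuvm = MAKE_GPUVM_APP_BASE(1);
	uint64_t scratch = MAKE_SCRATCH_APP_BASE(1);

	assert(lds == 0x2000000000000000ULL);
	assert(MAKE_LDS_APP_LIMIT(lds) == 0x20000000FFFFFFFFULL);
	assert(gpuvm == 0x2001000000000000ULL);
	assert(MAKE_GPUVM_APP_LIMIT(gpuvm) == 0x200100FFFFFFFFFFULL);
	assert(scratch == 0x2000000100000000ULL);
	assert(MAKE_SCRATCH_APP_LIMIT(scratch) == 0x20000001FFFFFFFFULL);
	printf("LDS %llx GPUVM %llx scratch %llx\n",
	       (unsigned long long)lds, (unsigned long long)gpuvm,
	       (unsigned long long)scratch);
	return 0;
}
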
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+deleted file mode 100644
+index 009d6f4..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
++++ /dev/null
+@@ -1,135 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include "kfd_priv.h"
+-#include "kfd_events.h"
+-#include "soc15_int.h"
+-
+-
+-static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid)
+-{
+- uint32_t pasid = 0;
+- const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+-
+- if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
+- pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
+-
+- return pasid;
+-}
+-
+-static bool event_interrupt_isr_v9(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry,
+- uint32_t *patched_ihre,
+- bool *patched_flag)
+-{
+- uint16_t source_id, client_id, pasid, vmid;
+- bool result = false;
+-
+- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+-
+- if (pasid) {
+- const uint32_t *data = ih_ring_entry;
+-
+- pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
+- client_id, source_id, pasid);
+- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+- data[0], data[1], data[2], data[3],
+- data[4], data[5], data[6], data[7]);
+- }
+-
+- if ((vmid >= dev->vm_info.first_vmid_kfd &&
+- vmid <= dev->vm_info.last_vmid_kfd) &&
+- (source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+- source_id == SOC15_INTSRC_SDMA_TRAP ||
+- source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+- source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+- client_id == SOC15_IH_CLIENTID_VMC ||
+- client_id == SOC15_IH_CLIENTID_UTCL2)) {
+-
+- /*
+- * KFD want to handle this INT, but MEC firmware did
+- * not send pasid. Try to get it from vmid mapping
+- * and patch the ih entry. It's a temp workaround.
+- */
+- WARN_ONCE((!pasid), "Fix me.\n");
+- if (!pasid) {
+- uint32_t temp = le32_to_cpu(ih_ring_entry[3]);
+-
+- pasid = kfd_get_pasid_from_vmid(dev, vmid);
+- memcpy(patched_ihre, ih_ring_entry,
+- dev->device_info->ih_ring_entry_size);
+- patched_ihre[3] = cpu_to_le32(temp | pasid);
+- *patched_flag = true;
+- }
+- result = pasid ? true : false;
+- }
+-
+- /* Do not process in ISR, just request it to be forwarded to WQ. */
+- return result;
+-
+-}
+-
+-static void event_interrupt_wq_v9(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry)
+-{
+- uint16_t source_id, client_id, pasid, vmid;
+- uint32_t context_id;
+-
+- source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+- client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+- pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
+- vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
+- context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+-
+- if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
+- kfd_signal_event_interrupt(pasid, context_id, 32);
+- else if (source_id == SOC15_INTSRC_SDMA_TRAP)
+- kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28);
+- else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG)
+- kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24);
+- else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
+- kfd_signal_hw_exception_event(pasid);
+- else if (client_id == SOC15_IH_CLIENTID_VMC ||
+- client_id == SOC15_IH_CLIENTID_UTCL2) {
+- struct kfd_vm_fault_info info = {0};
+- uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+-
+- info.vmid = vmid;
+- info.mc_id = client_id;
+- info.page_addr = ih_ring_entry[4] |
+- (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+- info.prot_valid = ring_id & 0x08;
+- info.prot_read = ring_id & 0x10;
+- info.prot_write = ring_id & 0x20;
+-
+- kfd_process_vm_fault(dev->dqm, pasid);
+- kfd_signal_vm_fault_event(dev, pasid, &info);
+- }
+-}
+-
+-const struct kfd_event_interrupt_class event_interrupt_class_v9 = {
+- .interrupt_isr = event_interrupt_isr_v9,
+- .interrupt_wq = event_interrupt_wq_v9,
+-};
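
For reference, the GFXv9 worker deleted above forwarded the IH context_id as a partial event ID: all 32 bits for CP end-of-pipe, the low 28 bits for SDMA traps and the low 24 bits for SQ interrupt messages. A user-space illustration of that masking (the context_id value is an arbitrary example):

/* Only the low valid_bits of context_id carry event bits. */
#include <assert.h>
#include <stdint.h>

static uint32_t partial_id(uint32_t context_id, unsigned int valid_bits)
{
	return valid_bits >= 32 ? context_id
				: context_id & ((1u << valid_bits) - 1);
}

int main(void)
{
	assert(partial_id(0x12345678, 32) == 0x12345678);	/* CP end-of-pipe */
	assert(partial_id(0x12345678, 28) == 0x02345678);	/* SDMA trap */
	assert(partial_id(0x12345678, 24) == 0x00345678);	/* SQ interrupt msg */
	return 0;
}
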
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+index 92a277f..70b3a99c 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+@@ -44,23 +44,24 @@
+ #include <linux/device.h>
+ #include "kfd_priv.h"
+
+-#define KFD_IH_NUM_ENTRIES 8192
++#define KFD_INTERRUPT_RING_SIZE 1024
+
+ static void interrupt_wq(struct work_struct *);
+
+ int kfd_interrupt_init(struct kfd_dev *kfd)
+ {
+- int r;
+-
+- r = kfifo_alloc(&kfd->ih_fifo,
+- KFD_IH_NUM_ENTRIES * kfd->device_info->ih_ring_entry_size,
+- GFP_KERNEL);
+- if (r) {
+- dev_err(kfd_chardev(), "Failed to allocate IH fifo\n");
+- return r;
+- }
++ void *interrupt_ring = kmalloc_array(KFD_INTERRUPT_RING_SIZE,
++ kfd->device_info->ih_ring_entry_size,
++ GFP_KERNEL);
++ if (!interrupt_ring)
++ return -ENOMEM;
++
++ kfd->interrupt_ring = interrupt_ring;
++ kfd->interrupt_ring_size =
++ KFD_INTERRUPT_RING_SIZE * kfd->device_info->ih_ring_entry_size;
++ atomic_set(&kfd->interrupt_ring_wptr, 0);
++ atomic_set(&kfd->interrupt_ring_rptr, 0);
+
+- kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1);
+ spin_lock_init(&kfd->interrupt_lock);
+
+ INIT_WORK(&kfd->interrupt_work, interrupt_wq);
+@@ -91,47 +92,74 @@ void kfd_interrupt_exit(struct kfd_dev *kfd)
+ spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
+
+ /*
+- * flush_work ensures that there are no outstanding
++ * flush_scheduled_work() ensures that there are no outstanding
+ * work-queue items that will access interrupt_ring. New work items
+ * can't be created because we stopped interrupt handling above.
+ */
+- flush_workqueue(kfd->ih_wq);
++ flush_scheduled_work();
+
+- kfifo_free(&kfd->ih_fifo);
++ kfree(kfd->interrupt_ring);
+ }
+
+ /*
+- * Assumption: single reader/writer. This function is not re-entrant
++ * This assumes that it can't be called concurrently with itself
++ * but only with dequeue_ih_ring_entry.
+ */
+ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry)
+ {
+- int count;
++ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
++ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
+
+- count = kfifo_in(&kfd->ih_fifo, ih_ring_entry,
+- kfd->device_info->ih_ring_entry_size);
+- if (count != kfd->device_info->ih_ring_entry_size) {
++ if ((rptr - wptr) % kfd->interrupt_ring_size ==
++ kfd->device_info->ih_ring_entry_size) {
++ /* This is very bad, the system is likely to hang. */
+ dev_err_ratelimited(kfd_chardev(),
+- "Interrupt ring overflow, dropping interrupt %d\n",
+- count);
++ "Interrupt ring overflow, dropping interrupt.\n");
+ return false;
+ }
+
++ memcpy(kfd->interrupt_ring + wptr, ih_ring_entry,
++ kfd->device_info->ih_ring_entry_size);
++
++ wptr = (wptr + kfd->device_info->ih_ring_entry_size) %
++ kfd->interrupt_ring_size;
++ smp_wmb(); /* Ensure memcpy'd data is visible before wptr update. */
++ atomic_set(&kfd->interrupt_ring_wptr, wptr);
++
+ return true;
+ }
+
+ /*
+- * Assumption: single reader/writer. This function is not re-entrant
++ * This assumes that it can't be called concurrently with itself
++ * but only with enqueue_ih_ring_entry.
+ */
+ static bool dequeue_ih_ring_entry(struct kfd_dev *kfd, void *ih_ring_entry)
+ {
+- int count;
++ /*
++ * Assume that wait queues have an implicit barrier, i.e. anything that
++ * happened in the ISR before it queued work is visible.
++ */
++
++ unsigned int wptr = atomic_read(&kfd->interrupt_ring_wptr);
++ unsigned int rptr = atomic_read(&kfd->interrupt_ring_rptr);
+
+- count = kfifo_out(&kfd->ih_fifo, ih_ring_entry,
+- kfd->device_info->ih_ring_entry_size);
++ if (rptr == wptr)
++ return false;
+
+- WARN_ON(count && count != kfd->device_info->ih_ring_entry_size);
++ memcpy(ih_ring_entry, kfd->interrupt_ring + rptr,
++ kfd->device_info->ih_ring_entry_size);
+
+- return count == kfd->device_info->ih_ring_entry_size;
++ rptr = (rptr + kfd->device_info->ih_ring_entry_size) %
++ kfd->interrupt_ring_size;
++
++ /*
++ * Ensure the rptr write update is not visible until
++ * memcpy has finished reading.
++ */
++ smp_mb();
++ atomic_set(&kfd->interrupt_ring_rptr, rptr);
++
++ return true;
+ }
+
+ static void interrupt_wq(struct work_struct *work)
+@@ -148,15 +176,13 @@ static void interrupt_wq(struct work_struct *work)
+ ih_ring_entry);
+ }
+
+-bool interrupt_is_wanted(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry,
+- uint32_t *patched_ihre, bool *flag)
++bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry)
+ {
+ /* integer and bitwise OR so there is no boolean short-circuiting */
+ unsigned int wanted = 0;
+
+ wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev,
+- ih_ring_entry, patched_ihre, flag);
++ ih_ring_entry);
+
+ return wanted != 0;
+ }
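
The interrupt ring re-introduced above keeps the read and write pointers as byte offsets in two atomics and treats the ring as full when the writer sits one entry behind the reader, so one entry's worth of space is always sacrificed to distinguish full from empty. A user-space model of that index arithmetic, assuming a power-of-two ring size (1024 entries of 16 bytes here) so the unsigned wrap-around stays consistent:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ENTRY_SIZE 16u
#define RING_SIZE  (1024u * ENTRY_SIZE)

/* Full when the writer is exactly one entry behind the reader. */
static bool ring_full(uint32_t rptr, uint32_t wptr)
{
	return (rptr - wptr) % RING_SIZE == ENTRY_SIZE;
}

static uint32_t advance(uint32_t ptr)
{
	return (ptr + ENTRY_SIZE) % RING_SIZE;
}

int main(void)
{
	uint32_t rptr = 0, wptr = 0;

	assert(!ring_full(rptr, wptr));			/* empty ring */
	/* Fill all but one entry: the next enqueue must be refused. */
	while (!ring_full(rptr, wptr))
		wptr = advance(wptr);
	assert(ring_full(rptr, wptr));
	rptr = advance(rptr);				/* consumer catches up */
	assert(!ring_full(rptr, wptr));
	return 0;
}
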
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+deleted file mode 100644
+index 0feb366..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
++++ /dev/null
+@@ -1,271 +0,0 @@
+-/*
+- * Copyright 2014 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include <linux/dma-buf.h>
+-#include <linux/slab.h>
+-#include <linux/random.h>
+-
+-#include "kfd_ipc.h"
+-#include "kfd_priv.h"
+-
+-#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4
+-#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1)
+-
+-static struct kfd_ipc_handles {
+- DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT);
+- struct mutex lock;
+-} kfd_ipc_handles;
+-
+-/* Since, handles are random numbers, it can be used directly as hashing key.
+- * The least 4 bits of the handle are used as key. However, during import all
+- * 128 bits of the handle are checked to prevent handle snooping.
+- */
+-#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK)
+-
+-static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj)
+-{
+- struct kfd_ipc_obj *obj;
+-
+- obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+- if (!obj)
+- return -ENOMEM;
+-
+- /* The initial ref belongs to the allocator process.
+- * The IPC object store itself does not hold a ref since
+- * there is no specific moment in time where that ref should
+- * be dropped, except "when there are no more userspace processes
+- * holding a ref to the object". Therefore the removal from IPC
+- * storage happens at ipc_obj release time.
+- */
+- kref_init(&obj->ref);
+- obj->data = val;
+- get_random_bytes(obj->share_handle, sizeof(obj->share_handle));
+-
+- memcpy(sh, obj->share_handle, sizeof(obj->share_handle));
+-
+- mutex_lock(&kfd_ipc_handles.lock);
+- hlist_add_head(&obj->node,
+- &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]);
+- mutex_unlock(&kfd_ipc_handles.lock);
+-
+- if (ipc_obj)
+- *ipc_obj = obj;
+-
+- return 0;
+-}
+-
+-static void ipc_obj_release(struct kref *r)
+-{
+- struct kfd_ipc_obj *obj;
+-
+- obj = container_of(r, struct kfd_ipc_obj, ref);
+-
+- mutex_lock(&kfd_ipc_handles.lock);
+- hash_del(&obj->node);
+- mutex_unlock(&kfd_ipc_handles.lock);
+-
+- dma_buf_put(obj->data);
+- kfree(obj);
+-}
+-
+-void ipc_obj_get(struct kfd_ipc_obj *obj)
+-{
+- kref_get(&obj->ref);
+-}
+-
+-void ipc_obj_put(struct kfd_ipc_obj **obj)
+-{
+- kref_put(&(*obj)->ref, ipc_obj_release);
+- *obj = NULL;
+-}
+-
+-int kfd_ipc_init(void)
+-{
+- mutex_init(&kfd_ipc_handles.lock);
+- hash_init(kfd_ipc_handles.handles);
+- return 0;
+-}
+-
+-static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev,
+- struct kfd_process *p,
+- uint32_t gpu_id, struct dma_buf *dmabuf,
+- uint64_t va_addr, uint64_t *handle,
+- uint64_t *mmap_offset,
+- struct kfd_ipc_obj *ipc_obj)
+-{
+- int r;
+- void *mem;
+- uint64_t size;
+- int idr_handle;
+- struct kfd_process_device *pdd = NULL;
+-
+- if (!handle)
+- return -EINVAL;
+-
+- if (!dev || !dev->kfd2kgd->import_dmabuf)
+- return -EINVAL;
+-
+- mutex_lock(&p->mutex);
+-
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- r = PTR_ERR(pdd);
+- goto err_unlock;
+- }
+-
+- r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf,
+- va_addr, pdd->vm,
+- (struct kgd_mem **)&mem, &size,
+- mmap_offset);
+- if (r)
+- goto err_unlock;
+-
+- idr_handle = kfd_process_device_create_obj_handle(pdd, mem,
+- va_addr, size,
+- ipc_obj);
+- if (idr_handle < 0) {
+- r = -EFAULT;
+- goto err_free;
+- }
+-
+- mutex_unlock(&p->mutex);
+-
+- *handle = MAKE_HANDLE(gpu_id, idr_handle);
+-
+- return 0;
+-
+-err_free:
+- dev->kfd2kgd->free_memory_of_gpu(dev->kgd,
+- (struct kgd_mem *)mem,
+- pdd->vm);
+-err_unlock:
+- mutex_unlock(&p->mutex);
+- return r;
+-}
+-
+-int kfd_ipc_import_dmabuf(struct kfd_dev *dev,
+- struct kfd_process *p,
+- uint32_t gpu_id, int dmabuf_fd,
+- uint64_t va_addr, uint64_t *handle,
+- uint64_t *mmap_offset)
+-{
+- int r;
+- struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd);
+-
+- if (!dmabuf)
+- return -EINVAL;
+-
+- r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf,
+- va_addr, handle, mmap_offset,
+- NULL);
+- dma_buf_put(dmabuf);
+- return r;
+-}
+-
+-int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p,
+- uint32_t gpu_id, uint32_t *share_handle,
+- uint64_t va_addr, uint64_t *handle,
+- uint64_t *mmap_offset)
+-{
+- int r;
+- struct kfd_ipc_obj *entry, *found = NULL;
+-
+- mutex_lock(&kfd_ipc_handles.lock);
+- /* Convert the user provided handle to hash key and search only in that
+- * bucket
+- */
+- hlist_for_each_entry(entry,
+- &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) {
+- if (!memcmp(entry->share_handle, share_handle,
+- sizeof(entry->share_handle))) {
+- found = entry;
+- break;
+- }
+- }
+- mutex_unlock(&kfd_ipc_handles.lock);
+-
+- if (!found)
+- return -EINVAL;
+- ipc_obj_get(found);
+-
+- pr_debug("Found ipc_dma_buf: %p\n", found->data);
+-
+- r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data,
+- va_addr, handle, mmap_offset,
+- found);
+- if (r)
+- goto error_unref;
+-
+- return r;
+-
+-error_unref:
+- ipc_obj_put(&found);
+- return r;
+-}
+-
+-int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p,
+- uint64_t handle, uint32_t *ipc_handle)
+-{
+- struct kfd_process_device *pdd = NULL;
+- struct kfd_ipc_obj *obj;
+- struct kfd_bo *kfd_bo = NULL;
+- struct dma_buf *dmabuf;
+- int r;
+-
+- if (!dev || !ipc_handle)
+- return -EINVAL;
+-
+- mutex_lock(&p->mutex);
+- pdd = kfd_bind_process_to_device(dev, p);
+- if (IS_ERR(pdd)) {
+- mutex_unlock(&p->mutex);
+- pr_err("Failed to get pdd\n");
+- return PTR_ERR(pdd);
+- }
+-
+- kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle));
+- mutex_unlock(&p->mutex);
+-
+- if (!kfd_bo) {
+- pr_err("Failed to get bo");
+- return -EINVAL;
+- }
+- if (kfd_bo->kfd_ipc_obj) {
+- memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle,
+- sizeof(kfd_bo->kfd_ipc_obj->share_handle));
+- return 0;
+- }
+-
+- r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm,
+- (struct kgd_mem *)kfd_bo->mem,
+- &dmabuf);
+- if (r)
+- return r;
+-
+- r = ipc_store_insert(dmabuf, ipc_handle, &obj);
+- if (r)
+- return r;
+-
+- kfd_bo->kfd_ipc_obj = obj;
+-
+- return r;
+-}
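
The removed IPC store above buckets 128-bit random share handles using only the low 4 bits of their first 64-bit word (HANDLE_TO_KEY) and relies on a full 128-bit comparison at import time to reject snooped handles. A small user-space sketch of that split between hash key and full comparison (the handle values are arbitrary examples):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4
#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1)

static unsigned int handle_to_key(const uint32_t handle[4])
{
	uint64_t first;

	memcpy(&first, handle, sizeof(first));	/* first 64 bits of the handle */
	return first & KFD_IPC_HASH_TABLE_SIZE_MASK;
}

int main(void)
{
	uint32_t a[4] = { 0x1234abc5, 0x00000005, 0, 0 };
	uint32_t b[4] = { 0xffffabc5, 0x77777775, 1, 1 };

	/* Same bucket (the low bits used as the key match) ... */
	assert(handle_to_key(a) == handle_to_key(b));
	/* ... but the full handles still differ, so import must compare all 128 bits. */
	assert(memcmp(a, b, sizeof(a)) != 0);
	return 0;
}
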
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
+deleted file mode 100644
+index 9ee8627..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
++++ /dev/null
+@@ -1,51 +0,0 @@
+-/*
+- * Copyright 2014 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-#ifndef KFD_IPC_H_
+-#define KFD_IPC_H_
+-
+-#include <linux/types.h>
+-#include "kfd_priv.h"
+-
+-struct kfd_ipc_obj {
+- struct hlist_node node;
+- struct kref ref;
+- void *data;
+- uint32_t share_handle[4];
+-};
+-
+-int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p,
+- uint32_t gpu_id, uint32_t *share_handle,
+- uint64_t va_addr, uint64_t *handle,
+- uint64_t *mmap_offset);
+-int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p,
+- uint32_t gpu_id, int dmabuf_fd,
+- uint64_t va_addr, uint64_t *handle,
+- uint64_t *mmap_offset);
+-int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p,
+- uint64_t handle, uint32_t *ipc_handle);
+-
+-void ipc_obj_get(struct kfd_ipc_obj *obj);
+-void ipc_obj_put(struct kfd_ipc_obj **obj);
+-
+-#endif /* KFD_IPC_H_ */
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+index 8cf9d44..0649dd4 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
+ kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
+
+- retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size,
++ retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel),
+ &kq->wptr_mem);
+
+ if (retval != 0)
+@@ -123,7 +123,6 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
+ prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr;
+ prop.eop_ring_buffer_address = kq->eop_gpu_addr;
+ prop.eop_ring_buffer_size = PAGE_SIZE;
+- prop.cu_mask = NULL;
+
+ if (init_queue(&kq->queue, &prop) != 0)
+ goto err_init_queue;
+@@ -185,8 +184,8 @@ static void uninitialize(struct kernel_queue *kq)
+ if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
+ kq->mqd->destroy_mqd(kq->mqd,
+ kq->queue->mqd,
+- KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+- KFD_UNMAP_LATENCY_MS,
++ false,
++ QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
+ kq->queue->pipe,
+ kq->queue->queue);
+ else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
+@@ -209,90 +208,39 @@ static int acquire_packet_buffer(struct kernel_queue *kq,
+ size_t available_size;
+ size_t queue_size_dwords;
+ uint32_t wptr, rptr;
+- uint64_t wptr64;
+ unsigned int *queue_address;
+
+- /* When rptr == wptr, the buffer is empty.
+- * When rptr == wptr + 1, the buffer is full.
+- * It is always rptr that advances to the position of wptr, rather than
+- * the opposite. So we can only use up to queue_size_dwords - 1 dwords.
+- */
+ rptr = *kq->rptr_kernel;
+- wptr = kq->pending_wptr;
+- wptr64 = kq->pending_wptr64;
++ wptr = *kq->wptr_kernel;
+ queue_address = (unsigned int *)kq->pq_kernel_addr;
+- queue_size_dwords = kq->queue->properties.queue_size / 4;
++ queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t);
+
+ pr_debug("rptr: %d\n", rptr);
+ pr_debug("wptr: %d\n", wptr);
+ pr_debug("queue_address 0x%p\n", queue_address);
+
+- available_size = (rptr + queue_size_dwords - 1 - wptr) %
++ available_size = (rptr - 1 - wptr + queue_size_dwords) %
+ queue_size_dwords;
+
+- if (packet_size_in_dwords > available_size) {
++ if (packet_size_in_dwords >= queue_size_dwords ||
++ packet_size_in_dwords >= available_size) {
+ /*
+ * make sure calling functions know
+ * acquire_packet_buffer() failed
+ */
+- goto err_no_space;
++ *buffer_ptr = NULL;
++ return -ENOMEM;
+ }
+
+ if (wptr + packet_size_in_dwords >= queue_size_dwords) {
+- /* make sure after rolling back to position 0, there is
+- * still enough space.
+- */
+- if (packet_size_in_dwords >= rptr)
+- goto err_no_space;
+-
+- /* fill nops, roll back and start at position 0 */
+ while (wptr > 0) {
+ queue_address[wptr] = kq->nop_packet;
+ wptr = (wptr + 1) % queue_size_dwords;
+- wptr64++;
+ }
+ }
+
+ *buffer_ptr = &queue_address[wptr];
+ kq->pending_wptr = wptr + packet_size_in_dwords;
+- kq->pending_wptr64 = wptr64 + packet_size_in_dwords;
+-
+- return 0;
+-
+-err_no_space:
+- *buffer_ptr = NULL;
+- return -ENOMEM;
+-}
+-
+-static int acquire_inline_ib(struct kernel_queue *kq,
+- size_t size_in_dwords,
+- unsigned int **buffer_ptr,
+- uint64_t *gpu_addr)
+-{
+- int ret;
+- unsigned int *buf;
+- union PM4_MES_TYPE_3_HEADER nop;
+-
+- if (size_in_dwords >= (1 << 14))
+- return -EINVAL;
+-
+- /* Allocate size_in_dwords on the ring, plus an extra dword
+- * for a NOP packet header
+- */
+- ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf);
+- if (ret)
+- return ret;
+-
+- /* Build a NOP packet that contains the IB as "payload". */
+- nop.u32all = 0;
+- nop.opcode = IT_NOP;
+- nop.count = size_in_dwords - 1;
+- nop.type = PM4_TYPE_3;
+-
+- *buf = nop.u32all;
+- *buffer_ptr = buf + 1;
+- *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr -
+- (unsigned long)kq->pq_kernel_addr);
+
+ return 0;
+ }
+@@ -310,7 +258,9 @@ static void submit_packet(struct kernel_queue *kq)
+ pr_debug("\n");
+ #endif
+
+- kq->ops_asic_specific.submit_packet(kq);
++ *kq->wptr_kernel = kq->pending_wptr;
++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
++ kq->pending_wptr);
+ }
+
+ static void rollback_packet(struct kernel_queue *kq)
+@@ -330,42 +280,25 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
+ kq->ops.initialize = initialize;
+ kq->ops.uninitialize = uninitialize;
+ kq->ops.acquire_packet_buffer = acquire_packet_buffer;
+- kq->ops.acquire_inline_ib = acquire_inline_ib;
+ kq->ops.submit_packet = submit_packet;
+ kq->ops.rollback_packet = rollback_packet;
+
+ switch (dev->device_info->asic_family) {
+ case CHIP_CARRIZO:
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+ kernel_queue_init_vi(&kq->ops_asic_specific);
+ break;
+
+ case CHIP_KAVERI:
+- case CHIP_HAWAII:
+ kernel_queue_init_cik(&kq->ops_asic_specific);
+ break;
+-
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- kernel_queue_init_v9(&kq->ops_asic_specific);
+- break;
+- default:
+- WARN(1, "Unexpected ASIC family %u",
+- dev->device_info->asic_family);
+- goto out_free;
+ }
+
+- if (kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE))
+- return kq;
+-
+- pr_err("Failed to init kernel queue\n");
+-
+-out_free:
+- kfree(kq);
+- return NULL;
++ if (!kq->ops.initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE)) {
++ pr_err("Failed to init kernel queue\n");
++ kfree(kq);
++ return NULL;
++ }
++ return kq;
+ }
+
+ void kernel_queue_uninit(struct kernel_queue *kq)
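
acquire_packet_buffer() in the hunk above sizes the ring in dwords, keeps one dword permanently unused so that rptr == wptr can only mean an empty ring, and pads the tail with NOP packets when a request would cross the wrap point. A user-space check of the restored available-size formula (the 256-dword queue size and the pointer values are illustrative):

#include <assert.h>
#include <stdint.h>

/* available = (rptr - 1 - wptr + queue_size) % queue_size */
static uint32_t available_dwords(uint32_t rptr, uint32_t wptr, uint32_t size)
{
	return (rptr - 1 - wptr + size) % size;
}

int main(void)
{
	const uint32_t size = 256;	/* queue size in dwords */

	assert(available_dwords(0, 0, size) == size - 1);	/* empty ring */
	assert(available_dwords(10, 9, size) == 0);		/* full ring  */
	assert(available_dwords(0, 200, size) == 55);		/* wrapped    */
	return 0;
}
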
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+index 82c94a6..5940531 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+@@ -42,12 +42,6 @@
+ * pending write pointer to that location so subsequent calls to
+ * acquire_packet_buffer will get a correct write pointer
+ *
+- * @acquire_inline_ib: Returns a pointer to the location in the kernel
+- * queue ring buffer where the calling function can write an inline IB. It is
+- * Guaranteed that there is enough space for that IB. It also updates the
+- * pending write pointer to that location so subsequent calls to
+- * acquire_packet_buffer will get a correct write pointer
+- *
+ * @submit_packet: Update the write pointer and doorbell of a kernel queue.
+ *
+ * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel
+@@ -65,10 +59,6 @@ struct kernel_queue_ops {
+ int (*acquire_packet_buffer)(struct kernel_queue *kq,
+ size_t packet_size_in_dwords,
+ unsigned int **buffer_ptr);
+- int (*acquire_inline_ib)(struct kernel_queue *kq,
+- size_t packet_size_in_dwords,
+- unsigned int **buffer_ptr,
+- uint64_t *gpu_addr);
+
+ void (*submit_packet)(struct kernel_queue *kq);
+ void (*rollback_packet)(struct kernel_queue *kq);
+@@ -82,7 +72,6 @@ struct kernel_queue {
+ struct kfd_dev *dev;
+ struct mqd_manager *mqd;
+ struct queue *queue;
+- uint64_t pending_wptr64;
+ uint32_t pending_wptr;
+ unsigned int nop_packet;
+
+@@ -90,10 +79,7 @@ struct kernel_queue {
+ uint32_t *rptr_kernel;
+ uint64_t rptr_gpu_addr;
+ struct kfd_mem_obj *wptr_mem;
+- union {
+- uint64_t *wptr64_kernel;
+- uint32_t *wptr_kernel;
+- };
++ uint32_t *wptr_kernel;
+ uint64_t wptr_gpu_addr;
+ struct kfd_mem_obj *pq;
+ uint64_t pq_gpu_addr;
+@@ -111,6 +97,5 @@ struct kernel_queue {
+
+ void kernel_queue_init_cik(struct kernel_queue_ops *ops);
+ void kernel_queue_init_vi(struct kernel_queue_ops *ops);
+-void kernel_queue_init_v9(struct kernel_queue_ops *ops);
+
+ #endif /* KFD_KERNEL_QUEUE_H_ */
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+index 2808422..a90eb44 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c
+@@ -22,19 +22,15 @@
+ */
+
+ #include "kfd_kernel_queue.h"
+-#include "kfd_pm4_headers.h"
+-#include "kfd_pm4_opcodes.h"
+
+ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size);
+ static void uninitialize_cik(struct kernel_queue *kq);
+-static void submit_packet_cik(struct kernel_queue *kq);
+
+ void kernel_queue_init_cik(struct kernel_queue_ops *ops)
+ {
+ ops->initialize = initialize_cik;
+ ops->uninitialize = uninitialize_cik;
+- ops->submit_packet = submit_packet_cik;
+ }
+
+ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
+@@ -46,127 +42,3 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev,
+ static void uninitialize_cik(struct kernel_queue *kq)
+ {
+ }
+-
+-static void submit_packet_cik(struct kernel_queue *kq)
+-{
+- *kq->wptr_kernel = kq->pending_wptr;
+- write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
+- kq->pending_wptr);
+-}
+-
+-static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer,
+- struct qcm_process_device *qpd)
+-{
+- struct pm4_map_process *packet;
+-
+- packet = (struct pm4_map_process *)buffer;
+-
+- memset(buffer, 0, sizeof(struct pm4_map_process));
+-
+- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS,
+- sizeof(struct pm4_map_process));
+- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+- packet->bitfields2.process_quantum = 1;
+- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+- packet->bitfields3.page_table_base = qpd->page_table_base;
+- packet->bitfields10.gds_size = qpd->gds_size;
+- packet->bitfields10.num_gws = qpd->num_gws;
+- packet->bitfields10.num_oac = qpd->num_oac;
+- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+-
+- packet->sh_mem_config = qpd->sh_mem_config;
+- packet->sh_mem_bases = qpd->sh_mem_bases;
+- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+-
+- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+-
+- return 0;
+-}
+-
+-static int pm_map_process_scratch_cik(struct packet_manager *pm,
+- uint32_t *buffer, struct qcm_process_device *qpd)
+-{
+- struct pm4_map_process_scratch_kv *packet;
+-
+- packet = (struct pm4_map_process_scratch_kv *)buffer;
+-
+- memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv));
+-
+- packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS,
+- sizeof(struct pm4_map_process_scratch_kv));
+- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+- packet->bitfields2.process_quantum = 1;
+- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+- packet->bitfields3.page_table_base = qpd->page_table_base;
+- packet->bitfields14.gds_size = qpd->gds_size;
+- packet->bitfields14.num_gws = qpd->num_gws;
+- packet->bitfields14.num_oac = qpd->num_oac;
+- packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+-
+- packet->sh_mem_config = qpd->sh_mem_config;
+- packet->sh_mem_bases = qpd->sh_mem_bases;
+- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+-
+- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
+-
+- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+-
+- return 0;
+-}
+-
+-static uint32_t pm_get_map_process_packet_size_cik(void)
+-{
+- return sizeof(struct pm4_map_process);
+-}
+-static uint32_t pm_get_map_process_scratch_packet_size_cik(void)
+-{
+- return sizeof(struct pm4_map_process_scratch_kv);
+-}
+-
+-
+-static struct packet_manager_funcs kfd_cik_pm_funcs = {
+- .map_process = pm_map_process_cik,
+- .runlist = pm_runlist_vi,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_vi,
+- .unmap_queues = pm_unmap_queues_vi,
+- .query_status = pm_query_status_vi,
+- .release_mem = pm_release_mem_vi,
+- .get_map_process_packet_size = pm_get_map_process_packet_size_cik,
+- .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
+- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
+- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
+- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
+- .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
+- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
+-};
+-
+-static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = {
+- .map_process = pm_map_process_scratch_cik,
+- .runlist = pm_runlist_vi,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_vi,
+- .unmap_queues = pm_unmap_queues_vi,
+- .query_status = pm_query_status_vi,
+- .release_mem = pm_release_mem_vi,
+- .get_map_process_packet_size =
+- pm_get_map_process_scratch_packet_size_cik,
+- .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
+- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
+- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
+- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
+- .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
+- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
+-};
+-
+-void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver)
+-{
+- if (fw_ver >= KFD_SCRATCH_KV_FW_VER)
+- pm->pmf = &kfd_cik_scratch_pm_funcs;
+- else
+- pm->pmf = &kfd_cik_pm_funcs;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+deleted file mode 100644
+index 5fe4f60..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
++++ /dev/null
+@@ -1,377 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-#include "kfd_kernel_queue.h"
+-#include "kfd_device_queue_manager.h"
+-#include "kfd_pm4_headers_ai.h"
+-#include "kfd_pm4_opcodes.h"
+-
+-static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
+- enum kfd_queue_type type, unsigned int queue_size);
+-static void uninitialize_v9(struct kernel_queue *kq);
+-static void submit_packet_v9(struct kernel_queue *kq);
+-
+-void kernel_queue_init_v9(struct kernel_queue_ops *ops)
+-{
+- ops->initialize = initialize_v9;
+- ops->uninitialize = uninitialize_v9;
+- ops->submit_packet = submit_packet_v9;
+-}
+-
+-static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev,
+- enum kfd_queue_type type, unsigned int queue_size)
+-{
+- int retval;
+-
+- retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
+- if (retval != 0)
+- return false;
+-
+- kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
+- kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
+-
+- memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
+-
+- return true;
+-}
+-
+-static void uninitialize_v9(struct kernel_queue *kq)
+-{
+- kfd_gtt_sa_free(kq->dev, kq->eop_mem);
+-}
+-
+-static void submit_packet_v9(struct kernel_queue *kq)
+-{
+- *kq->wptr64_kernel = kq->pending_wptr64;
+- write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
+- kq->pending_wptr64);
+-}
+-
+-static int pm_map_process_v9(struct packet_manager *pm,
+- uint32_t *buffer, struct qcm_process_device *qpd)
+-{
+- struct pm4_mes_map_process *packet;
+- uint64_t vm_page_table_base_addr =
+- (uint64_t)(qpd->page_table_base) << 12;
+-
+- packet = (struct pm4_mes_map_process *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+- sizeof(struct pm4_mes_map_process));
+- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+- packet->bitfields2.process_quantum = 1;
+- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+- packet->bitfields14.gds_size = qpd->gds_size;
+- packet->bitfields14.num_gws = qpd->num_gws;
+- packet->bitfields14.num_oac = qpd->num_oac;
+- packet->bitfields14.sdma_enable = 1;
+- packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+-
+- packet->sh_mem_config = qpd->sh_mem_config;
+- packet->sh_mem_bases = qpd->sh_mem_bases;
+- packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
+- packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8);
+- packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
+- packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
+-
+- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+-
+- packet->vm_context_page_table_base_addr_lo32 =
+- lower_32_bits(vm_page_table_base_addr);
+- packet->vm_context_page_table_base_addr_hi32 =
+- upper_32_bits(vm_page_table_base_addr);
+-
+- return 0;
+-}
+-
+-static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t ib, size_t ib_size_in_dwords, bool chain)
+-{
+- struct pm4_mes_runlist *packet;
+-
+- int concurrent_proc_cnt = 0;
+- struct kfd_dev *kfd = pm->dqm->dev;
+-
+- /* Determine the number of processes to map together to HW:
+- * it can not exceed the number of VMIDs available to the
+- * scheduler, and it is determined by the smaller of the number
+- * of processes in the runlist and kfd module parameter
+- * hws_max_conc_proc.
+- * Note: the arbitration between the number of VMIDs and
+- * hws_max_conc_proc has been done in
+- * kgd2kfd_device_init().
+- */
+- concurrent_proc_cnt = min(pm->dqm->processes_count,
+- kfd->max_proc_per_quantum);
+-
+-
+- packet = (struct pm4_mes_runlist *)buffer;
+-
+- memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+- sizeof(struct pm4_mes_runlist));
+-
+- packet->bitfields4.ib_size = ib_size_in_dwords;
+- packet->bitfields4.chain = chain ? 1 : 0;
+- packet->bitfields4.offload_polling = 0;
+- packet->bitfields4.valid = 1;
+- packet->bitfields4.process_cnt = concurrent_proc_cnt;
+- packet->ordinal2 = lower_32_bits(ib);
+- packet->ib_base_hi = upper_32_bits(ib);
+-
+- return 0;
+-}
+-
+-static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
+- struct queue *q, bool is_static)
+-{
+- struct pm4_mes_map_queues *packet;
+- bool use_static = is_static;
+-
+- packet = (struct pm4_mes_map_queues *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+- sizeof(struct pm4_mes_map_queues));
+- packet->bitfields2.alloc_format =
+- alloc_format__mes_map_queues__one_per_pipe_vi;
+- packet->bitfields2.num_queues = 1;
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+-
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_map_queues__compute_vi;
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__normal_compute_vi;
+-
+- switch (q->properties.type) {
+- case KFD_QUEUE_TYPE_COMPUTE:
+- if (use_static)
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__normal_latency_static_queue_vi;
+- break;
+- case KFD_QUEUE_TYPE_DIQ:
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__debug_interface_queue_vi;
+- break;
+- case KFD_QUEUE_TYPE_SDMA:
+- packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+- engine_sel__mes_map_queues__sdma0_vi;
+- use_static = false; /* no static queues under SDMA */
+- break;
+- default:
+- WARN(1, "queue type %d", q->properties.type);
+- return -EINVAL;
+- }
+- packet->bitfields3.doorbell_offset =
+- q->properties.doorbell_off;
+-
+- packet->mqd_addr_lo =
+- lower_32_bits(q->gart_mqd_addr);
+-
+- packet->mqd_addr_hi =
+- upper_32_bits(q->gart_mqd_addr);
+-
+- packet->wptr_addr_lo =
+- lower_32_bits((uint64_t)q->properties.write_ptr);
+-
+- packet->wptr_addr_hi =
+- upper_32_bits((uint64_t)q->properties.write_ptr);
+-
+- return 0;
+-}
+-
+-static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
+- enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param, bool reset,
+- unsigned int sdma_engine)
+-{
+- struct pm4_mes_unmap_queues *packet;
+-
+- packet = (struct pm4_mes_unmap_queues *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+- sizeof(struct pm4_mes_unmap_queues));
+- switch (type) {
+- case KFD_QUEUE_TYPE_COMPUTE:
+- case KFD_QUEUE_TYPE_DIQ:
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_unmap_queues__compute;
+- break;
+- case KFD_QUEUE_TYPE_SDMA:
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+- break;
+- default:
+- WARN(1, "queue type %d", type);
+- return -EINVAL;
+- }
+-
+- if (reset)
+- packet->bitfields2.action =
+- action__mes_unmap_queues__reset_queues;
+- else
+- packet->bitfields2.action =
+- action__mes_unmap_queues__preempt_queues;
+-
+- switch (filter) {
+- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+- packet->bitfields2.num_queues = 1;
+- packet->bitfields3b.doorbell_offset0 = filter_param;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+- packet->bitfields3a.pasid = filter_param;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__unmap_all_queues;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+- /* in this case, we do not preempt static queues */
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+- break;
+- default:
+- WARN(1, "filter %d", filter);
+- return -EINVAL;
+- }
+-
+- return 0;
+-
+-}
+-
+-static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t fence_address, uint32_t fence_value)
+-{
+- struct pm4_mes_query_status *packet;
+-
+- packet = (struct pm4_mes_query_status *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+-
+-
+- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+- sizeof(struct pm4_mes_query_status));
+-
+- packet->bitfields2.context_id = 0;
+- packet->bitfields2.interrupt_sel =
+- interrupt_sel__mes_query_status__completion_status;
+- packet->bitfields2.command =
+- command__mes_query_status__fence_only_after_write_ack;
+-
+- packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+- packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+- packet->data_hi = upper_32_bits((uint64_t)fence_value);
+- packet->data_lo = lower_32_bits((uint64_t)fence_value);
+-
+- return 0;
+-}
+-
+-
+-static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
+-{
+- struct pm4_mec_release_mem *packet;
+-
+- packet = (struct pm4_mec_release_mem *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+- sizeof(struct pm4_mec_release_mem));
+-
+- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+- packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
+- packet->bitfields2.tcl1_action_ena = 1;
+- packet->bitfields2.tc_action_ena = 1;
+- packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
+-
+- packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
+- packet->bitfields3.int_sel =
+- int_sel__mec_release_mem__send_interrupt_after_write_confirm;
+-
+- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+- packet->address_hi = upper_32_bits(gpu_addr);
+-
+- packet->data_lo = 0;
+-
+- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
+-}
+-
+-static uint32_t pm_get_map_process_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mes_map_process);
+-}
+-
+-static uint32_t pm_get_runlist_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mes_runlist);
+-}
+-
+-static uint32_t pm_get_map_queues_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mes_map_queues);
+-}
+-
+-static uint32_t pm_get_unmap_queues_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mes_unmap_queues);
+-}
+-
+-static uint32_t pm_get_query_status_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mes_query_status);
+-}
+-
+-static uint32_t pm_get_release_mem_packet_size_v9(void)
+-{
+- return sizeof(struct pm4_mec_release_mem);
+-}
+-
+-static struct packet_manager_funcs kfd_v9_pm_funcs = {
+- .map_process = pm_map_process_v9,
+- .runlist = pm_runlist_v9,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_v9,
+- .unmap_queues = pm_unmap_queues_v9,
+- .query_status = pm_query_status_v9,
+- .release_mem = pm_release_mem_v9,
+- .get_map_process_packet_size = pm_get_map_process_packet_size_v9,
+- .get_runlist_packet_size = pm_get_runlist_packet_size_v9,
+- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
+- .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9,
+- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9,
+- .get_query_status_packet_size = pm_get_query_status_packet_size_v9,
+- .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9,
+-};
+-
+-void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver)
+-{
+- pm->pmf = &kfd_v9_pm_funcs;
+-}
+-
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+index 9022ecb..f1d4828 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+@@ -22,20 +22,15 @@
+ */
+
+ #include "kfd_kernel_queue.h"
+-#include "kfd_device_queue_manager.h"
+-#include "kfd_pm4_headers_vi.h"
+-#include "kfd_pm4_opcodes.h"
+
+ static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
+ enum kfd_queue_type type, unsigned int queue_size);
+ static void uninitialize_vi(struct kernel_queue *kq);
+-static void submit_packet_vi(struct kernel_queue *kq);
+
+ void kernel_queue_init_vi(struct kernel_queue_ops *ops)
+ {
+ ops->initialize = initialize_vi;
+ ops->uninitialize = uninitialize_vi;
+- ops->submit_packet = submit_packet_vi;
+ }
+
+ static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev,
+@@ -59,359 +54,3 @@ static void uninitialize_vi(struct kernel_queue *kq)
+ {
+ kfd_gtt_sa_free(kq->dev, kq->eop_mem);
+ }
+-
+-static void submit_packet_vi(struct kernel_queue *kq)
+-{
+- *kq->wptr_kernel = kq->pending_wptr;
+- write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
+- kq->pending_wptr);
+-}
+-
+-static int pm_map_process_vi(struct packet_manager *pm,
+- uint32_t *buffer, struct qcm_process_device *qpd)
+-{
+- struct pm4_mes_map_process *packet;
+-
+- packet = (struct pm4_mes_map_process *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+- sizeof(struct pm4_mes_map_process));
+- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+- packet->bitfields2.process_quantum = 1;
+- packet->bitfields2.pasid = qpd->pqm->process->pasid;
+- packet->bitfields3.page_table_base = qpd->page_table_base;
+- packet->bitfields10.gds_size = qpd->gds_size;
+- packet->bitfields10.num_gws = qpd->num_gws;
+- packet->bitfields10.num_oac = qpd->num_oac;
+- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+-
+- packet->sh_mem_config = qpd->sh_mem_config;
+- packet->sh_mem_bases = qpd->sh_mem_bases;
+- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
+- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
+-
+- packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base;
+-
+- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+-
+- return 0;
+-}
+-
+-
+-unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size)
+-{
+- union PM4_MES_TYPE_3_HEADER header;
+-
+- header.u32All = 0;
+- header.opcode = opcode;
+- header.count = packet_size / 4 - 2;
+- header.type = PM4_TYPE_3;
+-
+- return header.u32All;
+-}
+-
+-int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t ib, size_t ib_size_in_dwords, bool chain)
+-{
+- struct pm4_mes_runlist *packet;
+-
+- int concurrent_proc_cnt = 0;
+- struct kfd_dev *kfd = pm->dqm->dev;
+-
+- /* Determine the number of processes to map together to HW:
+- * it can not exceed the number of VMIDs available to the
+- * scheduler, and it is determined by the smaller of the number
+- * of processes in the runlist and kfd module parameter
+- * hws_max_conc_proc.
+- * Note: the arbitration between the number of VMIDs and
+- * hws_max_conc_proc has been done in
+- * kgd2kfd_device_init().
+- */
+- concurrent_proc_cnt = min(pm->dqm->processes_count,
+- kfd->max_proc_per_quantum);
+-
+-
+- packet = (struct pm4_mes_runlist *)buffer;
+-
+- memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+- packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+- sizeof(struct pm4_mes_runlist));
+-
+- packet->bitfields4.ib_size = ib_size_in_dwords;
+- packet->bitfields4.chain = chain ? 1 : 0;
+- packet->bitfields4.offload_polling = 0;
+- packet->bitfields4.valid = 1;
+- packet->bitfields4.process_cnt = concurrent_proc_cnt;
+- packet->ordinal2 = lower_32_bits(ib);
+- packet->bitfields3.ib_base_hi = upper_32_bits(ib);
+-
+- return 0;
+-}
+-
+-int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct queue *q, bool is_static)
+-{
+- struct pm4_mes_map_queues *packet;
+- bool use_static = is_static;
+-
+- packet = (struct pm4_mes_map_queues *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+- sizeof(struct pm4_mes_map_queues));
+- packet->bitfields2.alloc_format =
+- alloc_format__mes_map_queues__one_per_pipe_vi;
+- packet->bitfields2.num_queues = 1;
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+-
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_map_queues__compute_vi;
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__normal_compute_vi;
+-
+- switch (q->properties.type) {
+- case KFD_QUEUE_TYPE_COMPUTE:
+- if (use_static)
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__normal_latency_static_queue_vi;
+- break;
+- case KFD_QUEUE_TYPE_DIQ:
+- packet->bitfields2.queue_type =
+- queue_type__mes_map_queues__debug_interface_queue_vi;
+- break;
+- case KFD_QUEUE_TYPE_SDMA:
+- packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+- engine_sel__mes_map_queues__sdma0_vi;
+- use_static = false; /* no static queues under SDMA */
+- break;
+- default:
+- WARN(1, "queue type %d", q->properties.type);
+- return -EINVAL;
+- }
+- packet->bitfields3.doorbell_offset =
+- q->properties.doorbell_off;
+-
+- packet->mqd_addr_lo =
+- lower_32_bits(q->gart_mqd_addr);
+-
+- packet->mqd_addr_hi =
+- upper_32_bits(q->gart_mqd_addr);
+-
+- packet->wptr_addr_lo =
+- lower_32_bits((uint64_t)q->properties.write_ptr);
+-
+- packet->wptr_addr_hi =
+- upper_32_bits((uint64_t)q->properties.write_ptr);
+-
+- return 0;
+-}
+-
+-int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct scheduling_resources *res)
+-{
+- struct pm4_mes_set_resources *packet;
+-
+- packet = (struct pm4_mes_set_resources *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
+- sizeof(struct pm4_mes_set_resources));
+-
+- packet->bitfields2.queue_type =
+- queue_type__mes_set_resources__hsa_interface_queue_hiq;
+- packet->bitfields2.vmid_mask = res->vmid_mask;
+- packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
+- packet->bitfields7.oac_mask = res->oac_mask;
+- packet->bitfields8.gds_heap_base = res->gds_heap_base;
+- packet->bitfields8.gds_heap_size = res->gds_heap_size;
+-
+- packet->gws_mask_lo = lower_32_bits(res->gws_mask);
+- packet->gws_mask_hi = upper_32_bits(res->gws_mask);
+-
+- packet->queue_mask_lo = lower_32_bits(res->queue_mask);
+- packet->queue_mask_hi = upper_32_bits(res->queue_mask);
+-
+- return 0;
+-}
+-
+-int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+- enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param, bool reset,
+- unsigned int sdma_engine)
+-{
+- struct pm4_mes_unmap_queues *packet;
+-
+- packet = (struct pm4_mes_unmap_queues *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+- sizeof(struct pm4_mes_unmap_queues));
+- switch (type) {
+- case KFD_QUEUE_TYPE_COMPUTE:
+- case KFD_QUEUE_TYPE_DIQ:
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_unmap_queues__compute;
+- break;
+- case KFD_QUEUE_TYPE_SDMA:
+- packet->bitfields2.engine_sel =
+- engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+- break;
+- default:
+- WARN(1, "queue type %d", type);
+- return -EINVAL;
+- }
+-
+- if (reset)
+- packet->bitfields2.action =
+- action__mes_unmap_queues__reset_queues;
+- else
+- packet->bitfields2.action =
+- action__mes_unmap_queues__preempt_queues;
+-
+- switch (filter) {
+- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+- packet->bitfields2.num_queues = 1;
+- packet->bitfields3b.doorbell_offset0 = filter_param;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+- packet->bitfields3a.pasid = filter_param;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__unmap_all_queues;
+- break;
+- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+- /* in this case, we do not preempt static queues */
+- packet->bitfields2.queue_sel =
+- queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+- break;
+- default:
+- WARN(1, "filter %d", filter);
+- return -EINVAL;
+- }
+-
+- return 0;
+-
+-}
+-
+-int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t fence_address, uint32_t fence_value)
+-{
+- struct pm4_mes_query_status *packet;
+-
+- packet = (struct pm4_mes_query_status *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+-
+-
+- packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+- sizeof(struct pm4_mes_query_status));
+-
+- packet->bitfields2.context_id = 0;
+- packet->bitfields2.interrupt_sel =
+- interrupt_sel__mes_query_status__completion_status;
+- packet->bitfields2.command =
+- command__mes_query_status__fence_only_after_write_ack;
+-
+- packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+- packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+- packet->data_hi = upper_32_bits((uint64_t)fence_value);
+- packet->data_lo = lower_32_bits((uint64_t)fence_value);
+-
+- return 0;
+-}
+-
+-
+-uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
+-{
+- struct pm4_mec_release_mem *packet;
+-
+- packet = (struct pm4_mec_release_mem *)buffer;
+- memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+-
+- packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+- sizeof(struct pm4_mec_release_mem));
+-
+- packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+- packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+- packet->bitfields2.tcl1_action_ena = 1;
+- packet->bitfields2.tc_action_ena = 1;
+- packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+- packet->bitfields2.atc = 0;
+-
+- packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
+- packet->bitfields3.int_sel =
+- int_sel___release_mem__send_interrupt_after_write_confirm;
+-
+- packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+- packet->address_hi = upper_32_bits(gpu_addr);
+-
+- packet->data_lo = 0;
+-
+- return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
+-}
+-
+-uint32_t pm_get_map_process_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_map_process);
+-}
+-
+-uint32_t pm_get_runlist_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_runlist);
+-}
+-
+-uint32_t pm_get_set_resources_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_set_resources);
+-}
+-
+-uint32_t pm_get_map_queues_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_map_queues);
+-}
+-
+-uint32_t pm_get_unmap_queues_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_unmap_queues);
+-}
+-
+-uint32_t pm_get_query_status_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mes_query_status);
+-}
+-
+-uint32_t pm_get_release_mem_packet_size_vi(void)
+-{
+- return sizeof(struct pm4_mec_release_mem);
+-}
+-
+-
+-static struct packet_manager_funcs kfd_vi_pm_funcs = {
+- .map_process = pm_map_process_vi,
+- .runlist = pm_runlist_vi,
+- .set_resources = pm_set_resources_vi,
+- .map_queues = pm_map_queues_vi,
+- .unmap_queues = pm_unmap_queues_vi,
+- .query_status = pm_query_status_vi,
+- .release_mem = pm_release_mem_vi,
+- .get_map_process_packet_size = pm_get_map_process_packet_size_vi,
+- .get_runlist_packet_size = pm_get_runlist_packet_size_vi,
+- .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi,
+- .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi,
+- .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi,
+- .get_query_status_packet_size = pm_get_query_status_packet_size_vi,
+- .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi,
+-};
+-
+-void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver)
+-{
+- pm->pmf = &kfd_vi_pm_funcs;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+index b6f9d23..0d73bea 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+@@ -29,10 +29,10 @@
+ #define KFD_DRIVER_AUTHOR "AMD Inc. and others"
+
+ #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs"
+-#define KFD_DRIVER_DATE "20160408"
+-#define KFD_DRIVER_MAJOR 2
+-#define KFD_DRIVER_MINOR 0
+-#define KFD_DRIVER_PATCHLEVEL 0
++#define KFD_DRIVER_DATE "20150421"
++#define KFD_DRIVER_MAJOR 0
++#define KFD_DRIVER_MINOR 7
++#define KFD_DRIVER_PATCHLEVEL 2
+
+ static const struct kgd2kfd_calls kgd2kfd = {
+ .exit = kgd2kfd_exit,
+@@ -42,12 +42,6 @@ static const struct kgd2kfd_calls kgd2kfd = {
+ .interrupt = kgd2kfd_interrupt,
+ .suspend = kgd2kfd_suspend,
+ .resume = kgd2kfd_resume,
+- .quiesce_mm = kgd2kfd_quiesce_mm,
+- .resume_mm = kgd2kfd_resume_mm,
+- .schedule_evict_and_restore_process =
+- kgd2kfd_schedule_evict_and_restore_process,
+- .pre_reset = kgd2kfd_pre_reset,
+- .post_reset = kgd2kfd_post_reset,
+ };
+
+ int sched_policy = KFD_SCHED_POLICY_HWS;
+@@ -55,15 +49,6 @@ module_param(sched_policy, int, 0444);
+ MODULE_PARM_DESC(sched_policy,
+ "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)");
+
+-int hws_max_conc_proc = 8;
+-module_param(hws_max_conc_proc, int, 0444);
+-MODULE_PARM_DESC(hws_max_conc_proc,
+- "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))");
+-
+-int cwsr_enable = 1;
+-module_param(cwsr_enable, int, 0444);
+-MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))");
+-
+ int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT;
+ module_param(max_num_of_queues_per_device, int, 0444);
+ MODULE_PARM_DESC(max_num_of_queues_per_device,
+@@ -76,33 +61,6 @@ MODULE_PARM_DESC(send_sigterm,
+
+ static int amdkfd_init_completed;
+
+-int debug_largebar;
+-module_param(debug_largebar, int, 0444);
+-MODULE_PARM_DESC(debug_largebar,
+- "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)");
+-
+-int ignore_crat;
+-module_param(ignore_crat, int, 0444);
+-MODULE_PARM_DESC(ignore_crat,
+- "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
+-
+-int vega10_noretry;
+-module_param_named(noretry, vega10_noretry, int, 0644);
+-MODULE_PARM_DESC(noretry,
+- "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
+-
+-int priv_cp_queues;
+-module_param(priv_cp_queues, int, 0644);
+-MODULE_PARM_DESC(priv_cp_queues,
+- "Enable privileged mode for CP queues (0 = off (default), 1 = on)");
+-
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) && defined(BUILD_AS_DKMS)
+-int cma_enable;
+-module_param(cma_enable, int, 0644);
+-MODULE_PARM_DESC(cma_enable,
+- "Enable CMA (1 = enable, 0 = disable (default)). Warning! relaxed access check");
+-#endif
+-
+ int kgd2kfd_init(unsigned int interface_version,
+ const struct kgd2kfd_calls **g2f)
+ {
+@@ -145,6 +103,10 @@ static int __init kfd_module_init(void)
+ return -1;
+ }
+
++ err = kfd_pasid_init();
++ if (err < 0)
++ return err;
++
+ err = kfd_chardev_init();
+ if (err < 0)
+ goto err_ioctl;
+@@ -153,16 +115,8 @@ static int __init kfd_module_init(void)
+ if (err < 0)
+ goto err_topology;
+
+- err = kfd_ipc_init();
+- if (err < 0)
+- goto err_topology;
+-
+ kfd_process_create_wq();
+
+- kfd_init_peer_direct();
+-
+- kfd_debugfs_init();
+-
+ amdkfd_init_completed = 1;
+
+ dev_info(kfd_device, "Initialized module\n");
+@@ -172,6 +126,7 @@ static int __init kfd_module_init(void)
+ err_topology:
+ kfd_chardev_exit();
+ err_ioctl:
++ kfd_pasid_exit();
+ return err;
+ }
+
+@@ -179,11 +134,10 @@ static void __exit kfd_module_exit(void)
+ {
+ amdkfd_init_completed = 0;
+
+- kfd_debugfs_fini();
+- kfd_close_peer_direct();
+ kfd_process_destroy_wq();
+ kfd_topology_shutdown();
+ kfd_chardev_exit();
++ kfd_pasid_exit();
+ dev_info(kfd_device, "Removed module\n");
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+index 8279b74..b1ef136 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+@@ -23,69 +23,14 @@
+
+ #include "kfd_priv.h"
+
+-/* Mapping queue priority to pipe priority, indexed by queue priority */
+-int pipe_priority_map[] = {
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_LOW,
+- KFD_PIPE_PRIORITY_CS_MEDIUM,
+- KFD_PIPE_PRIORITY_CS_MEDIUM,
+- KFD_PIPE_PRIORITY_CS_MEDIUM,
+- KFD_PIPE_PRIORITY_CS_MEDIUM,
+- KFD_PIPE_PRIORITY_CS_HIGH,
+- KFD_PIPE_PRIORITY_CS_HIGH,
+- KFD_PIPE_PRIORITY_CS_HIGH,
+- KFD_PIPE_PRIORITY_CS_HIGH,
+- KFD_PIPE_PRIORITY_CS_HIGH
+-};
+-
+-/* Mapping queue priority to SPI priority, indexed by queue priority
+- * SPI priority 2 and 3 are reserved for trap handler context save
+- */
+-int spi_priority_map[] = {
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_EXTRA_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_LOW
+-};
+-
+ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev)
+ {
+ switch (dev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ return mqd_manager_init_cik(type, dev);
+- case CHIP_HAWAII:
+- return mqd_manager_init_cik_hawaii(type, dev);
+ case CHIP_CARRIZO:
+ return mqd_manager_init_vi(type, dev);
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+- return mqd_manager_init_vi_tonga(type, dev);
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- return mqd_manager_init_v9(type, dev);
+- default:
+- WARN(1, "Unexpected ASIC family %u",
+- dev->device_info->asic_family);
+ }
+
+ return NULL;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+index dcaeda8..1f3a6ba 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+@@ -43,9 +43,6 @@
+ *
+ * @is_occupied: Checks if the relevant HQD slot is occupied.
+ *
+- * @get_wave_state: Retrieves context save state and optionally copies the
+- * control stack, if kept in the MQD, to the given userspace address.
+- *
+ * @mqd_mutex: Mqd manager mutex.
+ *
+ * @dev: The kfd device structure coupled with this module.
+@@ -62,8 +59,7 @@
+ * per KFD_MQD_TYPE for each device.
+ *
+ */
+-extern int pipe_priority_map[];
+-extern int spi_priority_map[];
++
+ struct mqd_manager {
+ int (*init_mqd)(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+@@ -89,15 +85,6 @@ struct mqd_manager {
+ uint64_t queue_address, uint32_t pipe_id,
+ uint32_t queue_id);
+
+- int (*get_wave_state)(struct mqd_manager *mm, void *mqd,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size);
+-
+-#if defined(CONFIG_DEBUG_FS)
+- int (*debugfs_show_mqd)(struct seq_file *m, void *data);
+-#endif
+-
+ struct mutex mqd_mutex;
+ struct kfd_dev *dev;
+ };
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+index 602da80..44ffd23 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+@@ -30,80 +30,12 @@
+ #include "cik_regs.h"
+ #include "cik_structs.h"
+ #include "oss/oss_2_4_sh_mask.h"
+-#include "gca/gfx_7_2_sh_mask.h"
+
+ static inline struct cik_mqd *get_mqd(void *mqd)
+ {
+ return (struct cik_mqd *)mqd;
+ }
+
+-static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+-{
+- return (struct cik_sdma_rlc_registers *)mqd;
+-}
+-
+-static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct cik_mqd *m;
+- struct kfd_cu_info cu_info;
+- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+- uint32_t cu_mask_count = q->cu_mask_count;
+- const uint32_t *cu_mask = q->cu_mask;
+- int se, cu_per_sh, cu_index, i;
+-
+- if (cu_mask_count == 0)
+- return;
+-
+- m = get_mqd(mqd);
+- m->compute_static_thread_mgmt_se0 = 0;
+- m->compute_static_thread_mgmt_se1 = 0;
+- m->compute_static_thread_mgmt_se2 = 0;
+- m->compute_static_thread_mgmt_se3 = 0;
+-
+- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+-
+- /* If # CU mask bits > # CUs, set it to the # of CUs */
+- if (cu_mask_count > cu_info.cu_active_number)
+- cu_mask_count = cu_info.cu_active_number;
+-
+- cu_index = 0;
+- for (se = 0; se < cu_info.num_shader_engines; se++) {
+- cu_per_sh = 0;
+-
+- /* Get the number of CUs on this Shader Engine */
+- for (i = 0; i < 4; i++)
+- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]);
+-
+- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32);
+- if ((cu_per_sh + (cu_index % 32)) > 32)
+- se_mask[se] |= cu_mask[(cu_index / 32) + 1]
+- << (32 - (cu_index % 32));
+- se_mask[se] &= (1 << cu_per_sh) - 1;
+- cu_index += cu_per_sh;
+- }
+- m->compute_static_thread_mgmt_se0 = se_mask[0];
+- m->compute_static_thread_mgmt_se1 = se_mask[1];
+- m->compute_static_thread_mgmt_se2 = se_mask[2];
+- m->compute_static_thread_mgmt_se3 = se_mask[3];
+-
+- pr_debug("Update cu mask to %#x %#x %#x %#x\n",
+- m->compute_static_thread_mgmt_se0,
+- m->compute_static_thread_mgmt_se1,
+- m->compute_static_thread_mgmt_se2,
+- m->compute_static_thread_mgmt_se3);
+-}
+-
+-static void set_priority(struct cik_mqd *m, struct queue_properties *q)
+-{
+- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+- m->cp_hqd_queue_priority = q->priority;
+- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 &
+- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) |
+- (spi_priority_map[q->priority] <<
+- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT);
+-}
+-
+ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+@@ -142,6 +74,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ m->cp_mqd_base_addr_lo = lower_32_bits(addr);
+ m->cp_mqd_base_addr_hi = upper_32_bits(addr);
+
++ m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN;
++ /* Although WinKFD writes this, I suspect it should not be necessary */
++ m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE;
++
+ m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS |
+ QUANTUM_DURATION(10);
+
+@@ -154,15 +90,12 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ * 1 = CS_MEDIUM (typically between HP3D and GFX
+ * 2 = CS_HIGH (typically above HP3D)
+ */
+- set_priority(m, q);
++ m->cp_hqd_pipe_priority = 1;
++ m->cp_hqd_queue_priority = 15;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL)
+ m->cp_hqd_iq_rptr = AQL_ENABLE;
+
+- if (priv_cp_queues)
+- m->cp_hqd_pq_control |=
+- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT;
+-
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+@@ -216,7 +149,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
+ {
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+- uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1);
++ uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1);
+
+ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr,
+@@ -227,30 +160,24 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ uint32_t pipe_id, uint32_t queue_id,
+ struct queue_properties *p, struct mm_struct *mms)
+ {
+- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+- (uint32_t __user *)p->write_ptr,
+- mms);
++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd);
+ }
+
+-static int __update_mqd(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q, unsigned int atc_bit)
++static int update_mqd(struct mqd_manager *mm, void *mqd,
++ struct queue_properties *q)
+ {
+ struct cik_mqd *m;
+
+ m = get_mqd(mqd);
+ m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE |
+- DEFAULT_MIN_AVAIL_SIZE;
+- m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE;
+- if (atc_bit) {
+- m->cp_hqd_pq_control |= PQ_ATC_EN;
+- m->cp_hqd_ib_control |= IB_ATC_EN;
+- }
++ DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN;
+
+ /*
+ * Calculating queue size which is log base 2 of actual queue size -1
+ * dwords and another -1 for ffs
+ */
+- m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
++ m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
++ - 1 - 1;
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+@@ -262,37 +189,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+ if (q->format == KFD_QUEUE_FORMAT_AQL)
+ m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
+
+- update_cu_mask(mm, mqd, q);
+- set_priority(m, q);
+-
+- q->is_active = (q->queue_size > 0 &&
++ q->is_active = false;
++ if (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
++ q->queue_percent > 0) {
++ q->is_active = true;
++ }
+
+ return 0;
+ }
+
+-static int update_mqd(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- return __update_mqd(mm, mqd, q, 1);
+-}
+-
+-static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- return __update_mqd(mm, mqd, q, 0);
+-}
+-
+ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ struct queue_properties *q)
+ {
+ struct cik_sdma_rlc_registers *m;
+
+ m = get_sdma_mqd(mqd);
+- m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4)
+- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
++ m->sdma_rlc_rb_cntl = ffs(q->queue_size / sizeof(unsigned int)) <<
++ SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+@@ -301,18 +215,24 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+ m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+ m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->sdma_rlc_doorbell =
+- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT;
++ m->sdma_rlc_doorbell = q->doorbell_off <<
++ SDMA0_RLC0_DOORBELL__OFFSET__SHIFT |
++ 1 << SDMA0_RLC0_DOORBELL__ENABLE__SHIFT;
+
+ m->sdma_rlc_virtual_addr = q->sdma_vm_addr;
+
+ m->sdma_engine_id = q->sdma_engine_id;
+ m->sdma_queue_id = q->sdma_queue_id;
+
+- q->is_active = (q->queue_size > 0 &&
++ q->is_active = false;
++ if (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
++ q->queue_percent > 0) {
++ m->sdma_rlc_rb_cntl |=
++ 1 << SDMA0_RLC0_RB_CNTL__RB_ENABLE__SHIFT;
++
++ q->is_active = true;
++ }
+
+ return 0;
+ }
+@@ -407,7 +327,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ * 1 = CS_MEDIUM (typically between HP3D and GFX
+ * 2 = CS_HIGH (typically above HP3D)
+ */
+- set_priority(m, q);
++ m->cp_hqd_pipe_priority = 1;
++ m->cp_hqd_queue_priority = 15;
+
+ *mqd = m;
+ if (gart_addr)
+@@ -432,42 +353,37 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+ * Calculating queue size which is log base 2 of actual queue
+ * size -1 dwords
+ */
+- m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
++ m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int))
++ - 1 - 1;
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->cp_hqd_pq_doorbell_control = DOORBELL_OFFSET(q->doorbell_off);
++ m->cp_hqd_pq_doorbell_control = DOORBELL_EN |
++ DOORBELL_OFFSET(q->doorbell_off);
+
+ m->cp_hqd_vmid = q->vmid;
+
+- q->is_active = (q->queue_size > 0 &&
++ m->cp_hqd_active = 0;
++ q->is_active = false;
++ if (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
+-
+- set_priority(m, q);
+- return 0;
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
++ q->queue_percent > 0) {
++ m->cp_hqd_active = 1;
++ q->is_active = true;
++ }
+
+-static int debugfs_show_mqd(struct seq_file *m, void *data)
+-{
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct cik_mqd), false);
+ return 0;
+ }
+
+-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
++struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
+ {
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct cik_sdma_rlc_registers), false);
+- return 0;
+-}
++ struct cik_sdma_rlc_registers *m;
+
+-#endif
++ m = (struct cik_sdma_rlc_registers *)mqd;
+
++ return m;
++}
+
+ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev)
+@@ -477,7 +393,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+- mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
++ mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ if (!mqd)
+ return NULL;
+
+@@ -492,9 +408,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+ break;
+ case KFD_MQD_TYPE_HIQ:
+ mqd->init_mqd = init_mqd_hiq;
+@@ -503,9 +416,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+ break;
+ case KFD_MQD_TYPE_SDMA:
+ mqd->init_mqd = init_mqd_sdma;
+@@ -514,9 +424,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_sdma;
+ mqd->destroy_mqd = destroy_mqd_sdma;
+ mqd->is_occupied = is_occupied_sdma;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+-#endif
+ break;
+ default:
+ kfree(mqd);
+@@ -526,15 +433,3 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ return mqd;
+ }
+
+-struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev)
+-{
+- struct mqd_manager *mqd;
+-
+- mqd = mqd_manager_init_cik(type, dev);
+- if (!mqd)
+- return NULL;
+- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE))
+- mqd->update_mqd = update_mqd_hawaii;
+- return mqd;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+deleted file mode 100644
+index 25a20e1..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
++++ /dev/null
+@@ -1,524 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-#include <linux/printk.h>
+-#include <linux/slab.h>
+-#include <linux/uaccess.h>
+-#include "kfd_priv.h"
+-#include "kfd_mqd_manager.h"
+-#include "v9_structs.h"
+-#include "gc/gc_9_0_offset.h"
+-#include "gc/gc_9_0_sh_mask.h"
+-#include "sdma0/sdma0_4_0_sh_mask.h"
+-
+-static inline struct v9_mqd *get_mqd(void *mqd)
+-{
+- return (struct v9_mqd *)mqd;
+-}
+-
+-static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+-{
+- return (struct v9_sdma_mqd *)mqd;
+-}
+-
+-static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct v9_mqd *m;
+- struct kfd_cu_info cu_info;
+- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+- uint32_t cu_mask_count = q->cu_mask_count;
+- const uint32_t *cu_mask = q->cu_mask;
+- int se, cu_per_sh, cu_index, i;
+-
+- if (cu_mask_count == 0)
+- return;
+-
+- m = get_mqd(mqd);
+- m->compute_static_thread_mgmt_se0 = 0;
+- m->compute_static_thread_mgmt_se1 = 0;
+- m->compute_static_thread_mgmt_se2 = 0;
+- m->compute_static_thread_mgmt_se3 = 0;
+-
+- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+-
+- /* If # CU mask bits > # CUs, set it to the # of CUs */
+- if (cu_mask_count > cu_info.cu_active_number)
+- cu_mask_count = cu_info.cu_active_number;
+-
+- cu_index = 0;
+- for (se = 0; se < cu_info.num_shader_engines; se++) {
+- cu_per_sh = 0;
+-
+- /* Get the number of CUs on this Shader Engine */
+- for (i = 0; i < 4; i++)
+- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]);
+-
+- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32);
+- if ((cu_per_sh + (cu_index % 32)) > 32)
+- se_mask[se] |= cu_mask[(cu_index / 32) + 1]
+- << (32 - (cu_index % 32));
+- se_mask[se] &= (1 << cu_per_sh) - 1;
+- cu_index += cu_per_sh;
+- }
+- m->compute_static_thread_mgmt_se0 = se_mask[0];
+- m->compute_static_thread_mgmt_se1 = se_mask[1];
+- m->compute_static_thread_mgmt_se2 = se_mask[2];
+- m->compute_static_thread_mgmt_se3 = se_mask[3];
+-
+- pr_debug("update cu mask to %#x %#x %#x %#x\n",
+- m->compute_static_thread_mgmt_se0,
+- m->compute_static_thread_mgmt_se1,
+- m->compute_static_thread_mgmt_se2,
+- m->compute_static_thread_mgmt_se3);
+-}
+-
+-static int init_mqd(struct mqd_manager *mm, void **mqd,
+- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+- struct queue_properties *q)
+-{
+- int retval;
+- uint64_t addr;
+- struct v9_mqd *m;
+- struct kfd_dev *kfd = mm->dev;
+-
+- /* From V9, for CWSR, the control stack is located on the next page
+- * boundary after the mqd, we will use the gtt allocation function
+- * instead of sub-allocation function.
+- */
+- if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
+- *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+- if (!*mqd_mem_obj)
+- return -ENOMEM;
+- retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd,
+- ALIGN(q->ctl_stack_size, PAGE_SIZE) +
+- ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
+- &((*mqd_mem_obj)->gtt_mem),
+- &((*mqd_mem_obj)->gpu_addr),
+- (void *)&((*mqd_mem_obj)->cpu_ptr));
+- } else
+- retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
+- mqd_mem_obj);
+- if (retval != 0)
+- return -ENOMEM;
+-
+- m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
+- addr = (*mqd_mem_obj)->gpu_addr;
+-
+- memset(m, 0, sizeof(struct v9_mqd));
+-
+- m->header = 0xC0310800;
+- m->compute_pipelinestat_enable = 1;
+- m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+- m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+- m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+- m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+-
+- m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
+- 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
+-
+- m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
+-
+- m->cp_mqd_base_addr_lo = lower_32_bits(addr);
+- m->cp_mqd_base_addr_hi = upper_32_bits(addr);
+-
+- m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
+- 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+- 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+-
+- m->cp_hqd_pipe_priority = 1;
+- m->cp_hqd_queue_priority = 15;
+-
+- if (q->format == KFD_QUEUE_FORMAT_AQL) {
+- m->cp_hqd_aql_control =
+- 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
+- }
+-
+- if (q->tba_addr) {
+- m->compute_pgm_rsrc2 |=
+- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+- }
+-
+- if (mm->dev->cwsr_enabled) {
+- m->cp_hqd_persistent_state |=
+- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+- m->cp_hqd_ctx_save_base_addr_lo =
+- lower_32_bits(q->ctx_save_restore_area_address);
+- m->cp_hqd_ctx_save_base_addr_hi =
+- upper_32_bits(q->ctx_save_restore_area_address);
+- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+- m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+- m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+- }
+-
+- if (priv_cp_queues)
+- m->cp_hqd_pq_control |=
+- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT;
+-
+- *mqd = m;
+- if (gart_addr)
+- *gart_addr = addr;
+- retval = mm->update_mqd(mm, m, q);
+-
+- return retval;
+-}
+-
+-static int load_mqd(struct mqd_manager *mm, void *mqd,
+- uint32_t pipe_id, uint32_t queue_id,
+- struct queue_properties *p, struct mm_struct *mms)
+-{
+- /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+- uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+-
+- return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+- (uint32_t __user *)p->write_ptr,
+- wptr_shift, 0, mms);
+-}
+-
+-static int update_mqd(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct v9_mqd *m;
+-
+- m = get_mqd(mqd);
+-
+- m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
+- m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
+- pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+-
+- m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+- m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+-
+- m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+- m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+-
+- m->cp_hqd_pq_doorbell_control =
+- q->doorbell_off <<
+- CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+- pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+- m->cp_hqd_pq_doorbell_control);
+-
+- m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT |
+- 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT;
+-
+- /*
+- * HW does not clamp this field correctly. Maximum EOP queue size
+- * is constrained by per-SE EOP done signal count, which is 8-bit.
+- * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
+- * more than (EOP entry count - 1) so a queue size of 0x800 dwords
+- * is safe, giving a maximum field value of 0xA.
+- */
+- m->cp_hqd_eop_control = min(0xA,
+- order_base_2(q->eop_ring_buffer_size / 4) - 1);
+- m->cp_hqd_eop_base_addr_lo =
+- lower_32_bits(q->eop_ring_buffer_address >> 8);
+- m->cp_hqd_eop_base_addr_hi =
+- upper_32_bits(q->eop_ring_buffer_address >> 8);
+-
+- m->cp_hqd_iq_timer = 0;
+-
+- m->cp_hqd_vmid = q->vmid;
+-
+- if (q->format == KFD_QUEUE_FORMAT_AQL) {
+- m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+- 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+- 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT |
+- 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT;
+- m->cp_hqd_pq_doorbell_control |=
+- 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+- }
+- if (mm->dev->cwsr_enabled)
+- m->cp_hqd_ctx_save_control = 0;
+-
+- update_cu_mask(mm, mqd, q);
+-
+- q->is_active = (q->queue_size > 0 &&
+- q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
+-
+- return 0;
+-}
+-
+-
+-static int destroy_mqd(struct mqd_manager *mm, void *mqd,
+- enum kfd_preempt_type type,
+- unsigned int timeout, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_destroy
+- (mm->dev->kgd, mqd, type, timeout,
+- pipe_id, queue_id);
+-}
+-
+-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+- struct kfd_mem_obj *mqd_mem_obj)
+-{
+- struct kfd_dev *kfd = mm->dev;
+-
+- if (mqd_mem_obj->gtt_mem) {
+- kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem);
+- kfree(mqd_mem_obj);
+- } else {
+- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+- }
+-}
+-
+-static bool is_occupied(struct mqd_manager *mm, void *mqd,
+- uint64_t queue_address, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_is_occupied(
+- mm->dev->kgd, queue_address,
+- pipe_id, queue_id);
+-}
+-
+-static int get_wave_state(struct mqd_manager *mm, void *mqd,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size)
+-{
+- struct v9_mqd *m;
+-
+- /* Control stack is located one page after MQD. */
+- void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+-
+- m = get_mqd(mqd);
+-
+- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+- m->cp_hqd_cntl_stack_offset;
+- *save_area_used_size = m->cp_hqd_wg_state_offset -
+- m->cp_hqd_cntl_stack_size;
+-
+- if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+- struct queue_properties *q)
+-{
+- struct v9_mqd *m;
+- int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+-
+- if (retval != 0)
+- return retval;
+-
+- m = get_mqd(*mqd);
+-
+- m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
+- 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
+-
+- return retval;
+-}
+-
+-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct v9_mqd *m;
+- int retval = update_mqd(mm, mqd, q);
+-
+- if (retval != 0)
+- return retval;
+-
+-	/* TODO: redundant; update_mqd already programs cp_hqd_vmid. */
+- m = get_mqd(mqd);
+- m->cp_hqd_vmid = q->vmid;
+- return retval;
+-}
+-
+-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+- struct queue_properties *q)
+-{
+- int retval;
+- struct v9_sdma_mqd *m;
+-
+-
+- retval = kfd_gtt_sa_allocate(mm->dev,
+- sizeof(struct v9_sdma_mqd),
+- mqd_mem_obj);
+-
+- if (retval != 0)
+- return -ENOMEM;
+-
+- m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+-
+- memset(m, 0, sizeof(struct v9_sdma_mqd));
+-
+- *mqd = m;
+- if (gart_addr)
+- *gart_addr = (*mqd_mem_obj)->gpu_addr;
+-
+- retval = mm->update_mqd(mm, m, q);
+-
+- return retval;
+-}
+-
+-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- struct kfd_mem_obj *mqd_mem_obj)
+-{
+- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+-}
+-
+-static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- uint32_t pipe_id, uint32_t queue_id,
+- struct queue_properties *p, struct mm_struct *mms)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+- (uint32_t __user *)p->write_ptr,
+- mms);
+-}
+-
+-#define SDMA_RLC_DUMMY_DEFAULT 0xf
+-
+-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct v9_sdma_mqd *m;
+-
+- m = get_sdma_mqd(mqd);
+- m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
+- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+-
+- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->sdmax_rlcx_doorbell_offset =
+- q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
+-
+- m->sdma_engine_id = q->sdma_engine_id;
+- m->sdma_queue_id = q->sdma_queue_id;
+- m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+-
+- q->is_active = (q->queue_size > 0 &&
+- q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
+-
+- return 0;
+-}
+-
+-/*
+- * The preempt type is ignored here because there is only one way
+- * to preempt an SDMA queue.
+- */
+-static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- enum kfd_preempt_type type,
+- unsigned int timeout, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+-}
+-
+-static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+- uint64_t queue_address, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-static int debugfs_show_mqd(struct seq_file *m, void *data)
+-{
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct v9_mqd), false);
+- return 0;
+-}
+-
+-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+-{
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct v9_sdma_mqd), false);
+- return 0;
+-}
+-
+-#endif
+-
+-struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev)
+-{
+- struct mqd_manager *mqd;
+-
+- if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+- return NULL;
+-
+- mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
+- if (!mqd)
+- return NULL;
+-
+- mqd->dev = dev;
+-
+- switch (type) {
+- case KFD_MQD_TYPE_CP:
+- case KFD_MQD_TYPE_COMPUTE:
+- mqd->init_mqd = init_mqd;
+- mqd->uninit_mqd = uninit_mqd;
+- mqd->load_mqd = load_mqd;
+- mqd->update_mqd = update_mqd;
+- mqd->destroy_mqd = destroy_mqd;
+- mqd->is_occupied = is_occupied;
+- mqd->get_wave_state = get_wave_state;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+- break;
+- case KFD_MQD_TYPE_HIQ:
+- mqd->init_mqd = init_mqd_hiq;
+- mqd->uninit_mqd = uninit_mqd;
+- mqd->load_mqd = load_mqd;
+- mqd->update_mqd = update_mqd_hiq;
+- mqd->destroy_mqd = destroy_mqd;
+- mqd->is_occupied = is_occupied;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+- break;
+- case KFD_MQD_TYPE_SDMA:
+- mqd->init_mqd = init_mqd_sdma;
+- mqd->uninit_mqd = uninit_mqd_sdma;
+- mqd->load_mqd = load_mqd_sdma;
+- mqd->update_mqd = update_mqd_sdma;
+- mqd->destroy_mqd = destroy_mqd_sdma;
+- mqd->is_occupied = is_occupied_sdma;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+-#endif
+- break;
+- default:
+- kfree(mqd);
+- return NULL;
+- }
+-
+- return mqd;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+index 9bf1212..73cbfe1 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+@@ -30,7 +30,6 @@
+ #include "vi_structs.h"
+ #include "gca/gfx_8_0_sh_mask.h"
+ #include "gca/gfx_8_0_enum.h"
+-#include "oss/oss_3_0_sh_mask.h"
+
+ #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8
+
+@@ -39,73 +38,6 @@ static inline struct vi_mqd *get_mqd(void *mqd)
+ return (struct vi_mqd *)mqd;
+ }
+
+-static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd)
+-{
+- return (struct vi_sdma_mqd *)mqd;
+-}
+-
+-static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct vi_mqd *m;
+- struct kfd_cu_info cu_info;
+- uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+- uint32_t cu_mask_count = q->cu_mask_count;
+- const uint32_t *cu_mask = q->cu_mask;
+- int se, cu_per_sh, cu_index, i;
+-
+- if (cu_mask_count == 0)
+- return;
+-
+- m = get_mqd(mqd);
+- m->compute_static_thread_mgmt_se0 = 0;
+- m->compute_static_thread_mgmt_se1 = 0;
+- m->compute_static_thread_mgmt_se2 = 0;
+- m->compute_static_thread_mgmt_se3 = 0;
+-
+- mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info);
+-
+- /* If # CU mask bits > # CUs, set it to the # of CUs */
+- if (cu_mask_count > cu_info.cu_active_number)
+- cu_mask_count = cu_info.cu_active_number;
+-
+- cu_index = 0;
+- for (se = 0; se < cu_info.num_shader_engines; se++) {
+- cu_per_sh = 0;
+-
+- /* Get the number of CUs on this Shader Engine */
+- for (i = 0; i < 4; i++)
+- cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]);
+-
+- se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32);
+- if ((cu_per_sh + (cu_index % 32)) > 32)
+- se_mask[se] |= cu_mask[(cu_index / 32) + 1]
+- << (32 - (cu_index % 32));
+- se_mask[se] &= (1 << cu_per_sh) - 1;
+- cu_index += cu_per_sh;
+- }
+- m->compute_static_thread_mgmt_se0 = se_mask[0];
+- m->compute_static_thread_mgmt_se1 = se_mask[1];
+- m->compute_static_thread_mgmt_se2 = se_mask[2];
+- m->compute_static_thread_mgmt_se3 = se_mask[3];
+-
+- pr_debug("Update cu mask to %#x %#x %#x %#x\n",
+- m->compute_static_thread_mgmt_se0,
+- m->compute_static_thread_mgmt_se1,
+- m->compute_static_thread_mgmt_se2,
+- m->compute_static_thread_mgmt_se3);
+-}
+-
+-static void set_priority(struct vi_mqd *m, struct queue_properties *q)
+-{
+- m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+- m->cp_hqd_queue_priority = q->priority;
+- m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 &
+- (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) |
+- (spi_priority_map[q->priority] <<
+- COMPUTE_PGM_RSRC1__PRIORITY__SHIFT);
+-}
+-
+ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+@@ -144,38 +76,14 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
+ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+
+- set_priority(m, q);
++ m->cp_hqd_pipe_priority = 1;
++ m->cp_hqd_queue_priority = 15;
++
+ m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT;
+
+ if (q->format == KFD_QUEUE_FORMAT_AQL)
+ m->cp_hqd_iq_rptr = 1;
+
+- if (q->tba_addr) {
+- m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8);
+- m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8);
+- m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8);
+- m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8);
+- m->compute_pgm_rsrc2 |=
+- (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT);
+- }
+-
+- if (mm->dev->cwsr_enabled) {
+- m->cp_hqd_persistent_state |=
+- (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+- m->cp_hqd_ctx_save_base_addr_lo =
+- lower_32_bits(q->ctx_save_restore_area_address);
+- m->cp_hqd_ctx_save_base_addr_hi =
+- upper_32_bits(q->ctx_save_restore_area_address);
+- m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+- m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+- m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+- m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+- }
+-
+- if (priv_cp_queues)
+- m->cp_hqd_pq_control |=
+- 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT;
+-
+ *mqd = m;
+ if (gart_addr)
+ *gart_addr = addr;
+@@ -190,7 +98,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
+ {
+ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+- uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1);
++ uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1);
+
+ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+ (uint32_t __user *)p->write_ptr,
+@@ -208,7 +116,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT |
+ atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT |
+ mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT;
+- m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1;
++ m->cp_hqd_pq_control |=
++ ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+ pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+
+ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+@@ -216,8 +125,6 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+
+ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+- m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+
+ m->cp_hqd_pq_doorbell_control =
+ q->doorbell_off <<
+@@ -240,7 +147,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+ * is safe, giving a maximum field value of 0xA.
+ */
+ m->cp_hqd_eop_control |= min(0xA,
+- order_base_2(q->eop_ring_buffer_size / 4) - 1);
++ ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
+ m->cp_hqd_eop_base_addr_lo =
+ lower_32_bits(q->eop_ring_buffer_address >> 8);
+ m->cp_hqd_eop_base_addr_hi =
+@@ -255,18 +162,13 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
+ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT;
+ }
+- if (mm->dev->cwsr_enabled)
+- m->cp_hqd_ctx_save_control =
+- atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT |
+- mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
+-
+- update_cu_mask(mm, mqd, q);
+- set_priority(m, q);
+
+- q->is_active = (q->queue_size > 0 &&
++ q->is_active = false;
++ if (q->queue_size > 0 &&
+ q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
++ q->queue_percent > 0) {
++ q->is_active = true;
++ }
+
+ return 0;
+ }
+@@ -278,12 +180,6 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
+ return __update_mqd(mm, mqd, q, MTYPE_CC, 1);
+ }
+
+-static int update_mqd_tonga(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- return __update_mqd(mm, mqd, q, MTYPE_UC, 0);
+-}
+-
+ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
+ enum kfd_preempt_type type,
+ unsigned int timeout, uint32_t pipe_id,
+@@ -309,28 +205,6 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
+ pipe_id, queue_id);
+ }
+
+-static int get_wave_state(struct mqd_manager *mm, void *mqd,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size)
+-{
+- struct vi_mqd *m;
+-
+- m = get_mqd(mqd);
+-
+- *ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+- m->cp_hqd_cntl_stack_offset;
+- *save_area_used_size = m->cp_hqd_wg_state_offset -
+- m->cp_hqd_cntl_stack_size;
+-
+- /* Control stack is not copied to user mode for GFXv8 because
+- * it's part of the context save area that is already
+- * accessible to user mode
+- */
+-
+- return 0;
+-}
+-
+ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+ struct queue_properties *q)
+@@ -363,118 +237,6 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+ return retval;
+ }
+
+-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+- struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+- struct queue_properties *q)
+-{
+- int retval;
+- struct vi_sdma_mqd *m;
+-
+-
+- retval = kfd_gtt_sa_allocate(mm->dev,
+- sizeof(struct vi_sdma_mqd),
+- mqd_mem_obj);
+-
+- if (retval != 0)
+- return -ENOMEM;
+-
+- m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+-
+- memset(m, 0, sizeof(struct vi_sdma_mqd));
+-
+- *mqd = m;
+- if (gart_addr)
+- *gart_addr = (*mqd_mem_obj)->gpu_addr;
+-
+- retval = mm->update_mqd(mm, m, q);
+-
+- return retval;
+-}
+-
+-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- struct kfd_mem_obj *mqd_mem_obj)
+-{
+- kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+-}
+-
+-static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- uint32_t pipe_id, uint32_t queue_id,
+- struct queue_properties *p, struct mm_struct *mms)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+- (uint32_t __user *)p->write_ptr,
+- mms);
+-}
+-
+-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- struct queue_properties *q)
+-{
+- struct vi_sdma_mqd *m;
+-
+- m = get_sdma_mqd(mqd);
+- m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4)
+- << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+- q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+- 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+- 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+-
+- m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+- m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+- m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+- m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+- m->sdmax_rlcx_doorbell =
+- q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT;
+-
+- m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr;
+-
+- m->sdma_engine_id = q->sdma_engine_id;
+- m->sdma_queue_id = q->sdma_queue_id;
+-
+- q->is_active = (q->queue_size > 0 &&
+- q->queue_address != 0 &&
+- q->queue_percent > 0 &&
+- !q->is_evicted);
+-
+- return 0;
+-}
+-
+-/*
+- * The preempt type is ignored here because there is only one way
+- * to preempt an SDMA queue.
+- */
+-static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+- enum kfd_preempt_type type,
+- unsigned int timeout, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+-}
+-
+-static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+- uint64_t queue_address, uint32_t pipe_id,
+- uint32_t queue_id)
+-{
+- return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-static int debugfs_show_mqd(struct seq_file *m, void *data)
+-{
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct vi_mqd), false);
+- return 0;
+-}
+-
+-static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+-{
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- data, sizeof(struct vi_sdma_mqd), false);
+- return 0;
+-}
+-
+-#endif
+-
+ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev)
+ {
+@@ -483,7 +245,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+ return NULL;
+
+- mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
++ mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
+ if (!mqd)
+ return NULL;
+
+@@ -498,10 +260,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+- mqd->get_wave_state = get_wave_state;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+ break;
+ case KFD_MQD_TYPE_HIQ:
+ mqd->init_mqd = init_mqd_hiq;
+@@ -510,20 +268,8 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ mqd->update_mqd = update_mqd_hiq;
+ mqd->destroy_mqd = destroy_mqd;
+ mqd->is_occupied = is_occupied;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd;
+-#endif
+ break;
+ case KFD_MQD_TYPE_SDMA:
+- mqd->init_mqd = init_mqd_sdma;
+- mqd->uninit_mqd = uninit_mqd_sdma;
+- mqd->load_mqd = load_mqd_sdma;
+- mqd->update_mqd = update_mqd_sdma;
+- mqd->destroy_mqd = destroy_mqd_sdma;
+- mqd->is_occupied = is_occupied_sdma;
+-#if defined(CONFIG_DEBUG_FS)
+- mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+-#endif
+ break;
+ default:
+ kfree(mqd);
+@@ -532,17 +278,3 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+
+ return mqd;
+ }
+-
+-struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev)
+-{
+- struct mqd_manager *mqd;
+-
+- mqd = mqd_manager_init_vi(type, dev);
+- if (!mqd)
+- return NULL;
+- if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE))
+- mqd->update_mqd = update_mqd_tonga;
+- return mqd;
+-}
+-
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+index 98c89d2..1d31260 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+@@ -26,6 +26,7 @@
+ #include "kfd_device_queue_manager.h"
+ #include "kfd_kernel_queue.h"
+ #include "kfd_priv.h"
++#include "kfd_pm4_headers_vi.h"
+ #include "kfd_pm4_opcodes.h"
+
+ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
+@@ -38,40 +39,38 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
+ *wptr = temp;
+ }
+
++static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size)
++{
++ union PM4_MES_TYPE_3_HEADER header;
++
++ header.u32All = 0;
++ header.opcode = opcode;
++ header.count = packet_size/sizeof(uint32_t) - 2;
++ header.type = PM4_TYPE_3;
++
++ return header.u32All;
++}
++
+ static void pm_calc_rlib_size(struct packet_manager *pm,
+ unsigned int *rlib_size,
+ bool *over_subscription)
+ {
+- unsigned int process_count, queue_count, compute_queue_count;
++ unsigned int process_count, queue_count;
+ unsigned int map_queue_size;
+- unsigned int max_proc_per_quantum = 1;
+-
+- struct kfd_dev *dev = pm->dqm->dev;
+
+ process_count = pm->dqm->processes_count;
+ queue_count = pm->dqm->queue_count;
+- compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
+-
+- /* check if there is over subscription
+- * Note: the arbitration between the number of VMIDs and
+- * hws_max_conc_proc has been done in
+- * kgd2kfd_device_init().
+- */
+
++	/* check if there is over subscription */
+ *over_subscription = false;
+-
+- if (dev->max_proc_per_quantum > 1)
+- max_proc_per_quantum = dev->max_proc_per_quantum;
+-
+- if ((process_count > max_proc_per_quantum) ||
+- compute_queue_count > get_queues_num(pm->dqm)) {
++ if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) {
+ *over_subscription = true;
+ pr_debug("Over subscribed runlist\n");
+ }
+
+- map_queue_size = pm->pmf->get_map_queues_packet_size();
++ map_queue_size = sizeof(struct pm4_mes_map_queues);
+ /* calculate run list ib allocation size */
+- *rlib_size = process_count * pm->pmf->get_map_process_packet_size() +
++ *rlib_size = process_count * sizeof(struct pm4_mes_map_process) +
+ queue_count * map_queue_size;
+
+ /*
+@@ -79,7 +78,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
+ * when over subscription
+ */
+ if (*over_subscription)
+- *rlib_size += pm->pmf->get_runlist_packet_size();
++ *rlib_size += sizeof(struct pm4_mes_runlist);
+
+ pr_debug("runlist ib size %d\n", *rlib_size);
+ }
+@@ -97,14 +96,12 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
+
+ pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
+
+- mutex_lock(&pm->lock);
+-
+ retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
+ &pm->ib_buffer_obj);
+
+ if (retval) {
+ pr_err("Failed to allocate runlist IB\n");
+- goto out;
++ return retval;
+ }
+
+ *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr;
+@@ -112,12 +109,131 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
+
+ memset(*rl_buffer, 0, *rl_buffer_size);
+ pm->allocated = true;
+-
+-out:
+- mutex_unlock(&pm->lock);
+ return retval;
+ }
+
++static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer,
++ uint64_t ib, size_t ib_size_in_dwords, bool chain)
++{
++ struct pm4_mes_runlist *packet;
++
++ if (WARN_ON(!ib))
++ return -EFAULT;
++
++ packet = (struct pm4_mes_runlist *)buffer;
++
++ memset(buffer, 0, sizeof(struct pm4_mes_runlist));
++ packet->header.u32All = build_pm4_header(IT_RUN_LIST,
++ sizeof(struct pm4_mes_runlist));
++
++ packet->bitfields4.ib_size = ib_size_in_dwords;
++ packet->bitfields4.chain = chain ? 1 : 0;
++ packet->bitfields4.offload_polling = 0;
++ packet->bitfields4.valid = 1;
++ packet->ordinal2 = lower_32_bits(ib);
++ packet->bitfields3.ib_base_hi = upper_32_bits(ib);
++
++ return 0;
++}
++
++static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer,
++ struct qcm_process_device *qpd)
++{
++ struct pm4_mes_map_process *packet;
++ struct queue *cur;
++ uint32_t num_queues;
++
++ packet = (struct pm4_mes_map_process *)buffer;
++
++ memset(buffer, 0, sizeof(struct pm4_mes_map_process));
++
++ packet->header.u32All = build_pm4_header(IT_MAP_PROCESS,
++ sizeof(struct pm4_mes_map_process));
++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
++ packet->bitfields2.process_quantum = 1;
++ packet->bitfields2.pasid = qpd->pqm->process->pasid;
++ packet->bitfields3.page_table_base = qpd->page_table_base;
++ packet->bitfields10.gds_size = qpd->gds_size;
++ packet->bitfields10.num_gws = qpd->num_gws;
++ packet->bitfields10.num_oac = qpd->num_oac;
++ num_queues = 0;
++ list_for_each_entry(cur, &qpd->queues_list, list)
++ num_queues++;
++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : num_queues;
++
++ packet->sh_mem_config = qpd->sh_mem_config;
++ packet->sh_mem_bases = qpd->sh_mem_bases;
++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base;
++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit;
++
++ /* TODO: scratch support */
++ packet->sh_hidden_private_base_vmid = 0;
++
++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
++
++ return 0;
++}
++
++static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer,
++ struct queue *q, bool is_static)
++{
++ struct pm4_mes_map_queues *packet;
++ bool use_static = is_static;
++
++ packet = (struct pm4_mes_map_queues *)buffer;
++ memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
++
++ packet->header.u32All = build_pm4_header(IT_MAP_QUEUES,
++ sizeof(struct pm4_mes_map_queues));
++ packet->bitfields2.alloc_format =
++ alloc_format__mes_map_queues__one_per_pipe_vi;
++ packet->bitfields2.num_queues = 1;
++ packet->bitfields2.queue_sel =
++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
++
++ packet->bitfields2.engine_sel =
++ engine_sel__mes_map_queues__compute_vi;
++ packet->bitfields2.queue_type =
++ queue_type__mes_map_queues__normal_compute_vi;
++
++ switch (q->properties.type) {
++ case KFD_QUEUE_TYPE_COMPUTE:
++ if (use_static)
++ packet->bitfields2.queue_type =
++ queue_type__mes_map_queues__normal_latency_static_queue_vi;
++ break;
++ case KFD_QUEUE_TYPE_DIQ:
++ packet->bitfields2.queue_type =
++ queue_type__mes_map_queues__debug_interface_queue_vi;
++ break;
++ case KFD_QUEUE_TYPE_SDMA:
++ packet->bitfields2.engine_sel =
++ engine_sel__mes_map_queues__sdma0_vi;
++ use_static = false; /* no static queues under SDMA */
++ break;
++ default:
++ WARN(1, "queue type %d", q->properties.type);
++ return -EINVAL;
++ }
++ packet->bitfields3.doorbell_offset =
++ q->properties.doorbell_off;
++
++ packet->mqd_addr_lo =
++ lower_32_bits(q->gart_mqd_addr);
++
++ packet->mqd_addr_hi =
++ upper_32_bits(q->gart_mqd_addr);
++
++ packet->wptr_addr_lo =
++ lower_32_bits((uint64_t)q->properties.write_ptr);
++
++ packet->wptr_addr_hi =
++ upper_32_bits((uint64_t)q->properties.write_ptr);
++
++ return 0;
++}
++
+ static int pm_create_runlist_ib(struct packet_manager *pm,
+ struct list_head *queues,
+ uint64_t *rl_gpu_addr,
+@@ -140,7 +256,6 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+
+ *rl_size_bytes = alloc_size_bytes;
+- pm->ib_size_bytes = alloc_size_bytes;
+
+ pr_debug("Building runlist ib process count: %d queues count %d\n",
+ pm->dqm->processes_count, pm->dqm->queue_count);
+@@ -155,12 +270,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return -ENOMEM;
+ }
+
+- retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd);
++ retval = pm_create_map_process(pm, &rl_buffer[rl_wptr], qpd);
+ if (retval)
+ return retval;
+
+ proccesses_mapped++;
+- inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(),
++ inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process),
+ alloc_size_bytes);
+
+ list_for_each_entry(kq, &qpd->priv_queue_list, list) {
+@@ -170,7 +285,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ pr_debug("static_queue, mapping kernel q %d, is debug status %d\n",
+ kq->queue->queue, qpd->is_debug);
+
+- retval = pm->pmf->map_queues(pm,
++ retval = pm_create_map_queue(pm,
+ &rl_buffer[rl_wptr],
+ kq->queue,
+ qpd->is_debug);
+@@ -178,7 +293,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+
+ inc_wptr(&rl_wptr,
+- pm->pmf->get_map_queues_packet_size(),
++ sizeof(struct pm4_mes_map_queues),
+ alloc_size_bytes);
+ }
+
+@@ -189,15 +304,16 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ pr_debug("static_queue, mapping user queue %d, is debug status %d\n",
+ q->queue, qpd->is_debug);
+
+- retval = pm->pmf->map_queues(pm,
++ retval = pm_create_map_queue(pm,
+ &rl_buffer[rl_wptr],
+ q,
+ qpd->is_debug);
++
+ if (retval)
+ return retval;
+
+ inc_wptr(&rl_wptr,
+- pm->pmf->get_map_queues_packet_size(),
++ sizeof(struct pm4_mes_map_queues),
+ alloc_size_bytes);
+ }
+ }
+@@ -205,7 +321,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ pr_debug("Finished map process and queues to runlist\n");
+
+ if (is_over_subscription)
+- retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
++ retval = pm_create_runlist(pm, &rl_buffer[rl_wptr],
+ *rl_gpu_addr,
+ alloc_size_bytes / sizeof(uint32_t),
+ true);
+@@ -217,8 +333,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
+ return retval;
+ }
+
+-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+- uint16_t fw_ver)
++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
+ {
+ pm->dqm = dqm;
+ mutex_init(&pm->lock);
+@@ -229,26 +344,6 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+ }
+ pm->allocated = false;
+
+- switch (pm->dqm->dev->device_info->asic_family) {
+- case CHIP_KAVERI:
+- case CHIP_HAWAII:
+- kfd_pm_func_init_cik(pm, fw_ver);
+- break;
+- case CHIP_CARRIZO:
+- case CHIP_TONGA:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+- kfd_pm_func_init_vi(pm, fw_ver);
+- break;
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- kfd_pm_func_init_v9(pm, fw_ver);
+- break;
+- default:
+- BUG();
+- }
+-
+ return 0;
+ }
+
+@@ -261,25 +356,38 @@ void pm_uninit(struct packet_manager *pm)
+ int pm_send_set_resources(struct packet_manager *pm,
+ struct scheduling_resources *res)
+ {
+- uint32_t *buffer, size;
++ struct pm4_mes_set_resources *packet;
+ int retval = 0;
+
+- size = pm->pmf->get_set_resources_packet_size();
+ mutex_lock(&pm->lock);
+ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+- size / sizeof(uint32_t),
+- (unsigned int **)&buffer);
+- if (!buffer) {
++ sizeof(*packet) / sizeof(uint32_t),
++ (unsigned int **)&packet);
++ if (!packet) {
+ pr_err("Failed to allocate buffer on kernel queue\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+- retval = pm->pmf->set_resources(pm, buffer, res);
+- if (!retval)
+- pm->priv_queue->ops.submit_packet(pm->priv_queue);
+- else
+- pm->priv_queue->ops.rollback_packet(pm->priv_queue);
++ memset(packet, 0, sizeof(struct pm4_mes_set_resources));
++ packet->header.u32All = build_pm4_header(IT_SET_RESOURCES,
++ sizeof(struct pm4_mes_set_resources));
++
++ packet->bitfields2.queue_type =
++ queue_type__mes_set_resources__hsa_interface_queue_hiq;
++ packet->bitfields2.vmid_mask = res->vmid_mask;
++ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY;
++ packet->bitfields7.oac_mask = res->oac_mask;
++ packet->bitfields8.gds_heap_base = res->gds_heap_base;
++ packet->bitfields8.gds_heap_size = res->gds_heap_size;
++
++ packet->gws_mask_lo = lower_32_bits(res->gws_mask);
++ packet->gws_mask_hi = upper_32_bits(res->gws_mask);
++
++ packet->queue_mask_lo = lower_32_bits(res->queue_mask);
++ packet->queue_mask_hi = upper_32_bits(res->queue_mask);
++
++ pm->priv_queue->ops.submit_packet(pm->priv_queue);
+
+ out:
+ mutex_unlock(&pm->lock);
+@@ -301,8 +409,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
+
+ pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr);
+
+- packet_size_dwords = pm->pmf->get_runlist_packet_size() /
+- sizeof(uint32_t);
++ packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t);
+ mutex_lock(&pm->lock);
+
+ retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+@@ -310,8 +417,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
+ if (retval)
+ goto fail_acquire_packet_buffer;
+
+- retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr,
+- rl_ib_size / sizeof(uint32_t), false);
++ retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr,
++ rl_ib_size / sizeof(uint32_t), false);
+ if (retval)
+ goto fail_create_runlist;
+
+@@ -333,59 +440,122 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
+ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
+ uint32_t fence_value)
+ {
+- uint32_t *buffer, size;
+- int retval = 0;
++ int retval;
++ struct pm4_mes_query_status *packet;
+
+ if (WARN_ON(!fence_address))
+ return -EFAULT;
+
+- size = pm->pmf->get_query_status_packet_size();
+ mutex_lock(&pm->lock);
+- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+- size / sizeof(uint32_t), (unsigned int **)&buffer);
+- if (!buffer) {
+- pr_err("Failed to allocate buffer on kernel queue\n");
+- retval = -ENOMEM;
+- goto out;
+- }
++ retval = pm->priv_queue->ops.acquire_packet_buffer(
++ pm->priv_queue,
++ sizeof(struct pm4_mes_query_status) / sizeof(uint32_t),
++ (unsigned int **)&packet);
++ if (retval)
++ goto fail_acquire_packet_buffer;
+
+- retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
+- if (!retval)
+- pm->priv_queue->ops.submit_packet(pm->priv_queue);
+- else
+- pm->priv_queue->ops.rollback_packet(pm->priv_queue);
++ packet->header.u32All = build_pm4_header(IT_QUERY_STATUS,
++ sizeof(struct pm4_mes_query_status));
+
+-out:
++ packet->bitfields2.context_id = 0;
++ packet->bitfields2.interrupt_sel =
++ interrupt_sel__mes_query_status__completion_status;
++ packet->bitfields2.command =
++ command__mes_query_status__fence_only_after_write_ack;
++
++ packet->addr_hi = upper_32_bits((uint64_t)fence_address);
++ packet->addr_lo = lower_32_bits((uint64_t)fence_address);
++ packet->data_hi = upper_32_bits((uint64_t)fence_value);
++ packet->data_lo = lower_32_bits((uint64_t)fence_value);
++
++ pm->priv_queue->ops.submit_packet(pm->priv_queue);
++
++fail_acquire_packet_buffer:
+ mutex_unlock(&pm->lock);
+ return retval;
+ }
+
+ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter filter,
++ enum kfd_preempt_type_filter mode,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine)
+ {
+- uint32_t *buffer, size;
+- int retval = 0;
++ int retval;
++ uint32_t *buffer;
++ struct pm4_mes_unmap_queues *packet;
+
+- size = pm->pmf->get_unmap_queues_packet_size();
+ mutex_lock(&pm->lock);
+- pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+- size / sizeof(uint32_t), (unsigned int **)&buffer);
+- if (!buffer) {
+- pr_err("Failed to allocate buffer on kernel queue\n");
+- retval = -ENOMEM;
+- goto out;
++ retval = pm->priv_queue->ops.acquire_packet_buffer(
++ pm->priv_queue,
++ sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t),
++ &buffer);
++ if (retval)
++ goto err_acquire_packet_buffer;
++
++ packet = (struct pm4_mes_unmap_queues *)buffer;
++ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
++ pr_debug("static_queue: unmapping queues: mode is %d , reset is %d , type is %d\n",
++ mode, reset, type);
++ packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES,
++ sizeof(struct pm4_mes_unmap_queues));
++ switch (type) {
++ case KFD_QUEUE_TYPE_COMPUTE:
++ case KFD_QUEUE_TYPE_DIQ:
++ packet->bitfields2.engine_sel =
++ engine_sel__mes_unmap_queues__compute;
++ break;
++ case KFD_QUEUE_TYPE_SDMA:
++ packet->bitfields2.engine_sel =
++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
++ break;
++ default:
++ WARN(1, "queue type %d", type);
++ retval = -EINVAL;
++ goto err_invalid;
+ }
+
+- retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
+- reset, sdma_engine);
+- if (!retval)
+- pm->priv_queue->ops.submit_packet(pm->priv_queue);
++ if (reset)
++ packet->bitfields2.action =
++ action__mes_unmap_queues__reset_queues;
+ else
+- pm->priv_queue->ops.rollback_packet(pm->priv_queue);
++ packet->bitfields2.action =
++ action__mes_unmap_queues__preempt_queues;
++
++ switch (mode) {
++ case KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE:
++ packet->bitfields2.queue_sel =
++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
++ packet->bitfields2.num_queues = 1;
++ packet->bitfields3b.doorbell_offset0 = filter_param;
++ break;
++ case KFD_PREEMPT_TYPE_FILTER_BY_PASID:
++ packet->bitfields2.queue_sel =
++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
++ packet->bitfields3a.pasid = filter_param;
++ break;
++ case KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES:
++ packet->bitfields2.queue_sel =
++ queue_sel__mes_unmap_queues__unmap_all_queues;
++ break;
++ case KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES:
++ /* in this case, we do not preempt static queues */
++ packet->bitfields2.queue_sel =
++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
++ break;
++ default:
++ WARN(1, "filter %d", mode);
++ retval = -EINVAL;
++ goto err_invalid;
++ }
++
++ pm->priv_queue->ops.submit_packet(pm->priv_queue);
+
+-out:
++ mutex_unlock(&pm->lock);
++ return 0;
++
++err_invalid:
++ pm->priv_queue->ops.rollback_packet(pm->priv_queue);
++err_acquire_packet_buffer:
+ mutex_unlock(&pm->lock);
+ return retval;
+ }
+@@ -399,18 +569,3 @@ void pm_release_ib(struct packet_manager *pm)
+ }
+ mutex_unlock(&pm->lock);
+ }
+-
+-int pm_debugfs_runlist(struct seq_file *m, void *data)
+-{
+- struct packet_manager *pm = data;
+-
+- if (!pm->allocated) {
+- seq_puts(m, " No active runlist\n");
+- return 0;
+- }
+-
+- seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
+- pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false);
+-
+- return 0;
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
+index 15fff44..1e06de0 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c
+@@ -20,64 +20,78 @@
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
++#include <linux/slab.h>
+ #include <linux/types.h>
+ #include "kfd_priv.h"
+
+-static unsigned int pasid_bits = 16;
+-static const struct kfd2kgd_calls *kfd2kgd;
++static unsigned long *pasid_bitmap;
++static unsigned int pasid_limit;
++static DEFINE_MUTEX(pasid_mutex);
++
++int kfd_pasid_init(void)
++{
++ pasid_limit = KFD_MAX_NUM_OF_PROCESSES;
++
++ pasid_bitmap = kcalloc(BITS_TO_LONGS(pasid_limit), sizeof(long),
++ GFP_KERNEL);
++ if (!pasid_bitmap)
++ return -ENOMEM;
++
++ set_bit(0, pasid_bitmap); /* PASID 0 is reserved. */
++
++ return 0;
++}
++
++void kfd_pasid_exit(void)
++{
++ kfree(pasid_bitmap);
++}
+
+ bool kfd_set_pasid_limit(unsigned int new_limit)
+ {
+- if (new_limit < 2)
+- return false;
+-
+- if (new_limit < (1U << pasid_bits)) {
+- if (kfd2kgd)
+- /* We've already allocated user PASIDs, too late to
+- * change the limit
+- */
+- return false;
+-
+- while (new_limit < (1U << pasid_bits))
+- pasid_bits--;
++ if (new_limit < pasid_limit) {
++ bool ok;
++
++ mutex_lock(&pasid_mutex);
++
++ /* ensure that no pasids >= new_limit are in-use */
++ ok = (find_next_bit(pasid_bitmap, pasid_limit, new_limit) ==
++ pasid_limit);
++ if (ok)
++ pasid_limit = new_limit;
++
++ mutex_unlock(&pasid_mutex);
++
++ return ok;
+ }
+
+ return true;
+ }
+
+-unsigned int kfd_get_pasid_limit(void)
++inline unsigned int kfd_get_pasid_limit(void)
+ {
+- return 1U << pasid_bits;
++ return pasid_limit;
+ }
+
+ unsigned int kfd_pasid_alloc(void)
+ {
+- int r;
+-
+- /* Find the first best KFD device for calling KGD */
+- if (!kfd2kgd) {
+- struct kfd_dev *dev = NULL;
+- unsigned int i = 0;
+-
+- while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) {
+- if (dev && dev->kfd2kgd) {
+- kfd2kgd = dev->kfd2kgd;
+- break;
+- }
+- i++;
+- }
+-
+- if (!kfd2kgd)
+- return false;
+- }
++ unsigned int found;
++
++ mutex_lock(&pasid_mutex);
++
++ found = find_first_zero_bit(pasid_bitmap, pasid_limit);
++ if (found == pasid_limit)
++ found = 0;
++ else
++ set_bit(found, pasid_bitmap);
+
+- r = kfd2kgd->alloc_pasid(pasid_bits);
++ mutex_unlock(&pasid_mutex);
+
+- return r > 0 ? r : 0;
++ return found;
+ }
+
+ void kfd_pasid_free(unsigned int pasid)
+ {
+- if (kfd2kgd)
+- kfd2kgd->free_pasid(pasid);
++ if (!WARN_ON(pasid == 0 || pasid >= pasid_limit))
++ clear_bit(pasid, pasid_bitmap);
+ }
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+deleted file mode 100644
+index 543ed83..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
++++ /dev/null
+@@ -1,513 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-
+-/* NOTE:
+- *
+- * This file contains logic to dynamically detect and enable PeerDirect
+- * support. PeerDirect support is delivered, for example, as part of
+- * OFED from Mellanox. Because we cannot rely on the corresponding
+- * OFED being installed, we should:
+- * - copy PeerDirect definitions locally to avoid dependency on
+- * corresponding header file
+- * - try dynamically detect address of PeerDirect function
+- * pointers.
+- *
+- * If dynamic detection failed then PeerDirect support should be
+- * enabled using the standard PeerDirect bridge driver from:
+- * https://github.com/RadeonOpenCompute/ROCnRDMA
+- *
+- *
+- * The PeerDirect support logic relies only on the official public API,
+- * to be as non-intrusive as possible.
+- *
+- **/
+-
+-#include <linux/device.h>
+-#include <linux/export.h>
+-#include <linux/pid.h>
+-#include <linux/err.h>
+-#include <linux/slab.h>
+-#include <linux/scatterlist.h>
+-#include <linux/module.h>
+-
+-#include "kfd_priv.h"
+-
+-
+-
+-/* ----------------------- PeerDirect interface ------------------------------*/
+-
+-/*
+- * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
+- *
+- * This software is available to you under a choice of one of two
+- * licenses. You may choose to be licensed under the terms of the GNU
+- * General Public License (GPL) Version 2, available from the file
+- * COPYING in the main directory of this source tree, or the
+- * OpenIB.org BSD license below:
+- *
+- * Redistribution and use in source and binary forms, with or
+- * without modification, are permitted provided that the following
+- * conditions are met:
+- *
+- * - Redistributions of source code must retain the above
+- * copyright notice, this list of conditions and the following
+- * disclaimer.
+- *
+- * - Redistributions in binary form must reproduce the above
+- * copyright notice, this list of conditions and the following
+- * disclaimer in the documentation and/or other materials
+- * provided with the distribution.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+- * SOFTWARE.
+- */
+-#define IB_PEER_MEMORY_NAME_MAX 64
+-#define IB_PEER_MEMORY_VER_MAX 16
+-
+-struct peer_memory_client {
+- char name[IB_PEER_MEMORY_NAME_MAX];
+- char version[IB_PEER_MEMORY_VER_MAX];
+- /* acquire return code: 1-mine, 0-not mine */
+- int (*acquire)(unsigned long addr, size_t size,
+- void *peer_mem_private_data,
+- char *peer_mem_name,
+- void **client_context);
+- int (*get_pages)(unsigned long addr,
+- size_t size, int write, int force,
+- struct sg_table *sg_head,
+- void *client_context, void *core_context);
+- int (*dma_map)(struct sg_table *sg_head, void *client_context,
+- struct device *dma_device, int dmasync, int *nmap);
+- int (*dma_unmap)(struct sg_table *sg_head, void *client_context,
+- struct device *dma_device);
+- void (*put_pages)(struct sg_table *sg_head, void *client_context);
+- unsigned long (*get_page_size)(void *client_context);
+- void (*release)(void *client_context);
+- void* (*get_context_private_data)(u64 peer_id);
+- void (*put_context_private_data)(void *context);
+-};
+-
+-typedef int (*invalidate_peer_memory)(void *reg_handle,
+- void *core_context);
+-
+-void *ib_register_peer_memory_client(struct peer_memory_client *peer_client,
+- invalidate_peer_memory *invalidate_callback);
+-void ib_unregister_peer_memory_client(void *reg_handle);
+-
+-
+-/*------------------- PeerDirect bridge driver ------------------------------*/
+-
+-#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0"
+-#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd"
+-
+-
+-static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client
+- *peer_client,
+- invalidate_peer_memory
+- *invalidate_callback);
+-
+-static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle);
+-
+-static const struct amd_rdma_interface *rdma_interface;
+-
+-static invalidate_peer_memory ib_invalidate_callback;
+-static void *ib_reg_handle;
+-
+-struct amd_mem_context {
+- uint64_t va;
+- uint64_t size;
+- struct pid *pid;
+-
+- struct amd_p2p_info *p2p_info;
+-
+- /* Flag that free callback was called */
+- int free_callback_called;
+-
+- /* Context received from PeerDirect call */
+- void *core_context;
+-};
+-
+-
+-static void free_callback(void *client_priv)
+-{
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_priv;
+-
+- pr_debug("data 0x%p\n", mem_context);
+-
+- if (!mem_context) {
+- pr_warn("Invalid client context\n");
+- return;
+- }
+-
+- pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context);
+-
+- /* Call back IB stack asking to invalidate memory */
+- (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context);
+-
+- /* amdkfd will free resources when we return from this callback.
+- * Set flag to inform that there is nothing to do on "put_pages", etc.
+- */
+- ACCESS_ONCE(mem_context->free_callback_called) = 1;
+-}
+-
+-
+-static int amd_acquire(unsigned long addr, size_t size,
+- void *peer_mem_private_data,
+- char *peer_mem_name, void **client_context)
+-{
+- int ret;
+- struct amd_mem_context *mem_context;
+- struct pid *pid;
+-
+- /* Get pointer to structure describing current process */
+- pid = get_task_pid(current, PIDTYPE_PID);
+-
+- pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n",
+- addr, (unsigned int)size, pid);
+-
+- /* Check if address is handled by AMD GPU driver */
+- ret = rdma_interface->is_gpu_address(addr, pid);
+-
+- if (!ret) {
+- pr_debug("Not GPU Address\n");
+- /* This is not GPU address */
+- return 0;
+- }
+-
+- pr_debug("GPU address\n");
+-
+- /* Initialize context used for operation with given address */
+- mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL);
+-
+- if (!mem_context)
+- return 0; /* Error case handled as not GPU address */
+-
+- mem_context->free_callback_called = 0;
+- mem_context->va = addr;
+- mem_context->size = size;
+-
+-	/* Save the PID. This callback is guaranteed to run in the
+-	 * correct process context, unlike the other callbacks.
+- */
+- mem_context->pid = pid;
+-
+- pr_debug("Client context %p\n", mem_context);
+-
+- /* Return pointer to allocated context */
+- *client_context = mem_context;
+-
+-	/* Return 1 to indicate that this address will be handled
+-	 * by the AMD GPU driver.
+- */
+- return 1;
+-}
+-
+-static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
+- struct sg_table *sg_head,
+- void *client_context, void *core_context)
+-{
+- int ret;
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n",
+- addr, (unsigned int)size, core_context);
+-
+- if (!mem_context) {
+- pr_warn("Invalid client context");
+- return -EINVAL;
+- }
+-
+- pr_debug("pid :0x%p\n", mem_context->pid);
+-
+-
+- if (addr != mem_context->va) {
+- pr_warn("Context address (0x%llx) is not the same\n",
+- mem_context->va);
+- return -EINVAL;
+- }
+-
+- if (size != mem_context->size) {
+- pr_warn("Context size (0x%llx) is not the same\n",
+- mem_context->size);
+- return -EINVAL;
+- }
+-
+- ret = rdma_interface->get_pages(addr,
+- size,
+- mem_context->pid,
+- &mem_context->p2p_info,
+- free_callback,
+- mem_context);
+-
+- if (ret || !mem_context->p2p_info) {
+- pr_err("Could not rdma::get_pages failure: %d\n", ret);
+- return ret;
+- }
+-
+- mem_context->core_context = core_context;
+-
+- /* Note: At this stage it is OK not to fill sg_table */
+- return 0;
+-}
+-
+-
+-static int amd_dma_map(struct sg_table *sg_head, void *client_context,
+- struct device *dma_device, int dmasync, int *nmap)
+-{
+- /*
+- * NOTE/TODO:
+- * We could have potentially three cases for real memory
+- * location:
+- * - all memory in the local
+- * - all memory in the system (RAM)
+- * - memory is spread (s/g) between local and system.
+- *
+- * In the case of all memory in the system we could use
+- * iommu driver to build DMA addresses but not in the case
+- * of local memory because currently iommu driver doesn't
+- * deal with local/device memory addresses (it requires "struct
+- * page").
+- *
+-	 * Accordingly, this path assumes that IOMMU functionality is
+-	 * disabled, so we can assume that the sg_table already
+-	 * contains DMA addresses.
+- *
+- */
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("Context 0x%p, sg_head 0x%p\n",
+- client_context, sg_head);
+-
+- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
+- mem_context->pid,
+- mem_context->va,
+- mem_context->size);
+-
+- if (!mem_context->p2p_info) {
+- pr_err("No sg table were allocated\n");
+- return -EINVAL;
+- }
+-
+-	/* Copy information about the previously allocated sg_table */
+- *sg_head = *mem_context->p2p_info->pages;
+-
+- /* Return number of pages */
+- *nmap = mem_context->p2p_info->pages->nents;
+-
+- return 0;
+-}
+-
+-static int amd_dma_unmap(struct sg_table *sg_head, void *client_context,
+- struct device *dma_device)
+-{
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("Context 0x%p, sg_table 0x%p\n",
+- client_context, sg_head);
+-
+- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
+- mem_context->pid,
+- mem_context->va,
+- mem_context->size);
+-
+- /* Assume success */
+- return 0;
+-}
+-static void amd_put_pages(struct sg_table *sg_head, void *client_context)
+-{
+- int ret = 0;
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("sg_head %p client_context: 0x%p\n",
+- sg_head, client_context);
+- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
+- mem_context->pid,
+- mem_context->va,
+- mem_context->size);
+-
+- pr_debug("mem_context->p2p_info %p\n",
+- mem_context->p2p_info);
+-
+- if (ACCESS_ONCE(mem_context->free_callback_called)) {
+- pr_debug("Free callback was called\n");
+- return;
+- }
+-
+- if (mem_context->p2p_info) {
+- ret = rdma_interface->put_pages(&mem_context->p2p_info);
+- mem_context->p2p_info = NULL;
+-
+- if (ret)
+- pr_err("Failure: %d (callback status %d)\n",
+- ret, mem_context->free_callback_called);
+- } else
+- pr_err("Pointer to p2p info is null\n");
+-}
+-static unsigned long amd_get_page_size(void *client_context)
+-{
+- unsigned long page_size;
+- int result;
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("context: %p\n", client_context);
+- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
+- mem_context->pid,
+- mem_context->va,
+- mem_context->size);
+-
+-
+- result = rdma_interface->get_page_size(
+- mem_context->va,
+- mem_context->size,
+- mem_context->pid,
+- &page_size);
+-
+- if (result) {
+- pr_err("Could not get page size. %d\n", result);
+-		/* If we failed to get the page size, we do not know what to do,
+-		 * so return a default value.
+- */
+- return PAGE_SIZE;
+- }
+-
+- return page_size;
+-}
+-
+-static void amd_release(void *client_context)
+-{
+- struct amd_mem_context *mem_context =
+- (struct amd_mem_context *)client_context;
+-
+- pr_debug("context: 0x%p\n", client_context);
+- pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
+- mem_context->pid,
+- mem_context->va,
+- mem_context->size);
+-
+- kfree(mem_context);
+-}
+-
+-
+-static struct peer_memory_client amd_mem_client = {
+- .acquire = amd_acquire,
+- .get_pages = amd_get_pages,
+- .dma_map = amd_dma_map,
+- .dma_unmap = amd_dma_unmap,
+- .put_pages = amd_put_pages,
+- .get_page_size = amd_get_page_size,
+- .release = amd_release,
+- .get_context_private_data = NULL,
+- .put_context_private_data = NULL,
+-};
+-
+-/** Initialize PeerDirect interface with RDMA Network stack.
+- *
+- * Because the network stack could be loaded later, we check for
+- * PeerDirect when an HSA process is created. If PeerDirect was already
+- * initialized we do nothing; otherwise we try to detect and register it.
+- */
+-void kfd_init_peer_direct(void)
+-{
+- int result;
+-
+- if (pfn_ib_unregister_peer_memory_client) {
+- pr_debug("PeerDirect support was already initialized\n");
+- return;
+- }
+-
+- pr_debug("Try to initialize PeerDirect support\n");
+-
+- pfn_ib_register_peer_memory_client =
+- (void *(*)(struct peer_memory_client *,
+- invalidate_peer_memory *))
+- symbol_request(ib_register_peer_memory_client);
+-
+- pfn_ib_unregister_peer_memory_client = (void (*)(void *))
+- symbol_request(ib_unregister_peer_memory_client);
+-
+- if (!pfn_ib_register_peer_memory_client ||
+- !pfn_ib_unregister_peer_memory_client) {
+- pr_debug("PeerDirect interface was not detected\n");
+- /* Do cleanup */
+- kfd_close_peer_direct();
+- return;
+- }
+-
+- result = amdkfd_query_rdma_interface(&rdma_interface);
+-
+- if (result < 0) {
+- pr_err("Cannot get RDMA Interface (result = %d)\n", result);
+- return;
+- }
+-
+- strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME);
+- strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION);
+-
+- ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client,
+- &ib_invalidate_callback);
+-
+- if (!ib_reg_handle) {
+- pr_err("Cannot register peer memory client\n");
+- /* Do cleanup */
+- kfd_close_peer_direct();
+- return;
+- }
+-
+- pr_info("PeerDirect support was initialized successfully\n");
+-}
+-
+-/**
+- * Close the connection to the PeerDirect interface of the RDMA network stack.
+- *
+- */
+-void kfd_close_peer_direct(void)
+-{
+- if (pfn_ib_unregister_peer_memory_client) {
+- if (ib_reg_handle)
+- pfn_ib_unregister_peer_memory_client(ib_reg_handle);
+-
+- symbol_put(ib_unregister_peer_memory_client);
+- }
+-
+- if (pfn_ib_register_peer_memory_client)
+- symbol_put(ib_register_peer_memory_client);
+-
+-
+- /* Reset pointers to be safe */
+- pfn_ib_unregister_peer_memory_client = NULL;
+- pfn_ib_register_peer_memory_client = NULL;
+- ib_reg_handle = NULL;
+-}
+-
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+deleted file mode 100644
+index ddad9be..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
++++ /dev/null
+@@ -1,583 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- *
+- */
+-
+-#ifndef F32_MES_PM4_PACKETS_H
+-#define F32_MES_PM4_PACKETS_H
+-
+-#ifndef PM4_MES_HEADER_DEFINED
+-#define PM4_MES_HEADER_DEFINED
+-union PM4_MES_TYPE_3_HEADER {
+- struct {
+- uint32_t reserved1 : 8; /* < reserved */
+- uint32_t opcode : 8; /* < IT opcode */
+- uint32_t count : 14;/* < number of DWORDs - 1 in the
+- * information body.
+- */
+- uint32_t type : 2; /* < packet identifier.
+- * It should be 3 for type 3 packets
+- */
+- };
+- uint32_t u32All;
+-};
+-#endif /* PM4_MES_HEADER_DEFINED */
+-
+-/*--------------------MES_SET_RESOURCES--------------------*/
+-
+-#ifndef PM4_MES_SET_RESOURCES_DEFINED
+-#define PM4_MES_SET_RESOURCES_DEFINED
+-enum mes_set_resources_queue_type_enum {
+- queue_type__mes_set_resources__kernel_interface_queue_kiq = 0,
+- queue_type__mes_set_resources__hsa_interface_queue_hiq = 1,
+- queue_type__mes_set_resources__hsa_debug_interface_queue = 4
+-};
+-
+-
+-struct pm4_mes_set_resources {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- uint32_t vmid_mask:16;
+- uint32_t unmap_latency:8;
+- uint32_t reserved1:5;
+- enum mes_set_resources_queue_type_enum queue_type:3;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- uint32_t queue_mask_lo;
+- uint32_t queue_mask_hi;
+- uint32_t gws_mask_lo;
+- uint32_t gws_mask_hi;
+-
+- union {
+- struct {
+- uint32_t oac_mask:16;
+- uint32_t reserved2:16;
+- } bitfields7;
+- uint32_t ordinal7;
+- };
+-
+- union {
+- struct {
+- uint32_t gds_heap_base:6;
+- uint32_t reserved3:5;
+- uint32_t gds_heap_size:6;
+- uint32_t reserved4:15;
+- } bitfields8;
+- uint32_t ordinal8;
+- };
+-
+-};
+-#endif
+-
+-/*--------------------MES_RUN_LIST--------------------*/
+-
+-#ifndef PM4_MES_RUN_LIST_DEFINED
+-#define PM4_MES_RUN_LIST_DEFINED
+-
+-struct pm4_mes_runlist {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved1:2;
+- uint32_t ib_base_lo:30;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- uint32_t ib_base_hi;
+-
+- union {
+- struct {
+- uint32_t ib_size:20;
+- uint32_t chain:1;
+- uint32_t offload_polling:1;
+- uint32_t reserved2:1;
+- uint32_t valid:1;
+- uint32_t process_cnt:4;
+- uint32_t reserved3:4;
+- } bitfields4;
+- uint32_t ordinal4;
+- };
+-
+-};
+-#endif
+-
+-/*--------------------MES_MAP_PROCESS--------------------*/
+-
+-#ifndef PM4_MES_MAP_PROCESS_DEFINED
+-#define PM4_MES_MAP_PROCESS_DEFINED
+-
+-struct pm4_mes_map_process {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- uint32_t pasid:16;
+- uint32_t reserved1:8;
+- uint32_t diq_enable:1;
+- uint32_t process_quantum:7;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- uint32_t vm_context_page_table_base_addr_lo32;
+-
+- uint32_t vm_context_page_table_base_addr_hi32;
+-
+- uint32_t sh_mem_bases;
+-
+- uint32_t sh_mem_config;
+-
+- uint32_t sq_shader_tba_lo;
+-
+- uint32_t sq_shader_tba_hi;
+-
+- uint32_t sq_shader_tma_lo;
+-
+- uint32_t sq_shader_tma_hi;
+-
+- uint32_t reserved6;
+-
+- uint32_t gds_addr_lo;
+-
+- uint32_t gds_addr_hi;
+-
+- union {
+- struct {
+- uint32_t num_gws:6;
+- uint32_t reserved7:1;
+- uint32_t sdma_enable:1;
+- uint32_t num_oac:4;
+- uint32_t reserved8:4;
+- uint32_t gds_size:6;
+- uint32_t num_queues:10;
+- } bitfields14;
+- uint32_t ordinal14;
+- };
+-
+- uint32_t completion_signal_lo;
+-
+- uint32_t completion_signal_hi;
+-
+-};
+-
+-#endif
+-
+-/*--------------------MES_MAP_PROCESS_VM--------------------*/
+-
+-#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED
+-#define PM4_MES_MAP_PROCESS_VM_DEFINED
+-
+-struct PM4_MES_MAP_PROCESS_VM {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- uint32_t reserved1;
+-
+- uint32_t vm_context_cntl;
+-
+- uint32_t reserved2;
+-
+- uint32_t vm_context_page_table_end_addr_lo32;
+-
+- uint32_t vm_context_page_table_end_addr_hi32;
+-
+- uint32_t vm_context_page_table_start_addr_lo32;
+-
+- uint32_t vm_context_page_table_start_addr_hi32;
+-
+- uint32_t reserved3;
+-
+- uint32_t reserved4;
+-
+- uint32_t reserved5;
+-
+- uint32_t reserved6;
+-
+- uint32_t reserved7;
+-
+- uint32_t reserved8;
+-
+- uint32_t completion_signal_lo32;
+-
+- uint32_t completion_signal_hi32;
+-
+-};
+-#endif
+-
+-/*--------------------MES_MAP_QUEUES--------------------*/
+-
+-#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED
+-#define PM4_MES_MAP_QUEUES_VI_DEFINED
+-enum mes_map_queues_queue_sel_enum {
+- queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0,
+-queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1
+-};
+-
+-enum mes_map_queues_queue_type_enum {
+- queue_type__mes_map_queues__normal_compute_vi = 0,
+- queue_type__mes_map_queues__debug_interface_queue_vi = 1,
+- queue_type__mes_map_queues__normal_latency_static_queue_vi = 2,
+-queue_type__mes_map_queues__low_latency_static_queue_vi = 3
+-};
+-
+-enum mes_map_queues_alloc_format_enum {
+- alloc_format__mes_map_queues__one_per_pipe_vi = 0,
+-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
+-};
+-
+-enum mes_map_queues_engine_sel_enum {
+- engine_sel__mes_map_queues__compute_vi = 0,
+- engine_sel__mes_map_queues__sdma0_vi = 2,
+- engine_sel__mes_map_queues__sdma1_vi = 3
+-};
+-
+-
+-struct pm4_mes_map_queues {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved1:4;
+- enum mes_map_queues_queue_sel_enum queue_sel:2;
+- uint32_t reserved2:15;
+- enum mes_map_queues_queue_type_enum queue_type:3;
+- enum mes_map_queues_alloc_format_enum alloc_format:2;
+- enum mes_map_queues_engine_sel_enum engine_sel:3;
+- uint32_t num_queues:3;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved3:1;
+- uint32_t check_disable:1;
+- uint32_t doorbell_offset:26;
+- uint32_t reserved4:4;
+- } bitfields3;
+- uint32_t ordinal3;
+- };
+-
+- uint32_t mqd_addr_lo;
+- uint32_t mqd_addr_hi;
+- uint32_t wptr_addr_lo;
+- uint32_t wptr_addr_hi;
+-};
+-#endif
+-
+-/*--------------------MES_QUERY_STATUS--------------------*/
+-
+-#ifndef PM4_MES_QUERY_STATUS_DEFINED
+-#define PM4_MES_QUERY_STATUS_DEFINED
+-enum mes_query_status_interrupt_sel_enum {
+- interrupt_sel__mes_query_status__completion_status = 0,
+- interrupt_sel__mes_query_status__process_status = 1,
+- interrupt_sel__mes_query_status__queue_status = 2
+-};
+-
+-enum mes_query_status_command_enum {
+- command__mes_query_status__interrupt_only = 0,
+- command__mes_query_status__fence_only_immediate = 1,
+- command__mes_query_status__fence_only_after_write_ack = 2,
+- command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3
+-};
+-
+-enum mes_query_status_engine_sel_enum {
+- engine_sel__mes_query_status__compute = 0,
+- engine_sel__mes_query_status__sdma0_queue = 2,
+- engine_sel__mes_query_status__sdma1_queue = 3
+-};
+-
+-struct pm4_mes_query_status {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- uint32_t context_id:28;
+- enum mes_query_status_interrupt_sel_enum interrupt_sel:2;
+- enum mes_query_status_command_enum command:2;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- union {
+- struct {
+- uint32_t pasid:16;
+- uint32_t reserved1:16;
+- } bitfields3a;
+- struct {
+- uint32_t reserved2:2;
+- uint32_t doorbell_offset:26;
+- enum mes_query_status_engine_sel_enum engine_sel:3;
+- uint32_t reserved3:1;
+- } bitfields3b;
+- uint32_t ordinal3;
+- };
+-
+- uint32_t addr_lo;
+- uint32_t addr_hi;
+- uint32_t data_lo;
+- uint32_t data_hi;
+-};
+-#endif
+-
+-/*--------------------MES_UNMAP_QUEUES--------------------*/
+-
+-#ifndef PM4_MES_UNMAP_QUEUES_DEFINED
+-#define PM4_MES_UNMAP_QUEUES_DEFINED
+-enum mes_unmap_queues_action_enum {
+- action__mes_unmap_queues__preempt_queues = 0,
+- action__mes_unmap_queues__reset_queues = 1,
+- action__mes_unmap_queues__disable_process_queues = 2,
+- action__mes_unmap_queues__reserved = 3
+-};
+-
+-enum mes_unmap_queues_queue_sel_enum {
+- queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0,
+- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1,
+- queue_sel__mes_unmap_queues__unmap_all_queues = 2,
+- queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3
+-};
+-
+-enum mes_unmap_queues_engine_sel_enum {
+- engine_sel__mes_unmap_queues__compute = 0,
+- engine_sel__mes_unmap_queues__sdma0 = 2,
+- engine_sel__mes_unmap_queues__sdmal = 3
+-};
+-
+-struct pm4_mes_unmap_queues {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /* header */
+- uint32_t ordinal1;
+- };
+-
+- union {
+- struct {
+- enum mes_unmap_queues_action_enum action:2;
+- uint32_t reserved1:2;
+- enum mes_unmap_queues_queue_sel_enum queue_sel:2;
+- uint32_t reserved2:20;
+- enum mes_unmap_queues_engine_sel_enum engine_sel:3;
+- uint32_t num_queues:3;
+- } bitfields2;
+- uint32_t ordinal2;
+- };
+-
+- union {
+- struct {
+- uint32_t pasid:16;
+- uint32_t reserved3:16;
+- } bitfields3a;
+- struct {
+- uint32_t reserved4:2;
+- uint32_t doorbell_offset0:26;
+- int32_t reserved5:4;
+- } bitfields3b;
+- uint32_t ordinal3;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved6:2;
+- uint32_t doorbell_offset1:26;
+- uint32_t reserved7:4;
+- } bitfields4;
+- uint32_t ordinal4;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved8:2;
+- uint32_t doorbell_offset2:26;
+- uint32_t reserved9:4;
+- } bitfields5;
+- uint32_t ordinal5;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved10:2;
+- uint32_t doorbell_offset3:26;
+- uint32_t reserved11:4;
+- } bitfields6;
+- uint32_t ordinal6;
+- };
+-};
+-#endif
+-
+-#ifndef PM4_MEC_RELEASE_MEM_DEFINED
+-#define PM4_MEC_RELEASE_MEM_DEFINED
+-
+-enum mec_release_mem_event_index_enum {
+- event_index__mec_release_mem__end_of_pipe = 5,
+- event_index__mec_release_mem__shader_done = 6
+-};
+-
+-enum mec_release_mem_cache_policy_enum {
+- cache_policy__mec_release_mem__lru = 0,
+- cache_policy__mec_release_mem__stream = 1
+-};
+-
+-enum mec_release_mem_pq_exe_status_enum {
+- pq_exe_status__mec_release_mem__default = 0,
+- pq_exe_status__mec_release_mem__phase_update = 1
+-};
+-
+-enum mec_release_mem_dst_sel_enum {
+- dst_sel__mec_release_mem__memory_controller = 0,
+- dst_sel__mec_release_mem__tc_l2 = 1,
+- dst_sel__mec_release_mem__queue_write_pointer_register = 2,
+- dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3
+-};
+-
+-enum mec_release_mem_int_sel_enum {
+- int_sel__mec_release_mem__none = 0,
+- int_sel__mec_release_mem__send_interrupt_only = 1,
+- int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2,
+- int_sel__mec_release_mem__send_data_after_write_confirm = 3,
+- int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4,
+- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5,
+- int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6
+-};
+-
+-enum mec_release_mem_data_sel_enum {
+- data_sel__mec_release_mem__none = 0,
+- data_sel__mec_release_mem__send_32_bit_low = 1,
+- data_sel__mec_release_mem__send_64_bit_data = 2,
+- data_sel__mec_release_mem__send_gpu_clock_counter = 3,
+- data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4,
+- data_sel__mec_release_mem__store_gds_data_to_memory = 5
+-};
+-
+-struct pm4_mec_release_mem {
+- union {
+- union PM4_MES_TYPE_3_HEADER header; /*header */
+- unsigned int ordinal1;
+- };
+-
+- union {
+- struct {
+- unsigned int event_type:6;
+- unsigned int reserved1:2;
+- enum mec_release_mem_event_index_enum event_index:4;
+- unsigned int tcl1_vol_action_ena:1;
+- unsigned int tc_vol_action_ena:1;
+- unsigned int reserved2:1;
+- unsigned int tc_wb_action_ena:1;
+- unsigned int tcl1_action_ena:1;
+- unsigned int tc_action_ena:1;
+- uint32_t reserved3:1;
+- uint32_t tc_nc_action_ena:1;
+- uint32_t tc_wc_action_ena:1;
+- uint32_t tc_md_action_ena:1;
+- uint32_t reserved4:3;
+- enum mec_release_mem_cache_policy_enum cache_policy:2;
+- uint32_t reserved5:2;
+- enum mec_release_mem_pq_exe_status_enum pq_exe_status:1;
+- uint32_t reserved6:2;
+- } bitfields2;
+- unsigned int ordinal2;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved7:16;
+- enum mec_release_mem_dst_sel_enum dst_sel:2;
+- uint32_t reserved8:6;
+- enum mec_release_mem_int_sel_enum int_sel:3;
+- uint32_t reserved9:2;
+- enum mec_release_mem_data_sel_enum data_sel:3;
+- } bitfields3;
+- unsigned int ordinal3;
+- };
+-
+- union {
+- struct {
+- uint32_t reserved10:2;
+- unsigned int address_lo_32b:30;
+- } bitfields4;
+- struct {
+- uint32_t reserved11:3;
+- uint32_t address_lo_64b:29;
+- } bitfields4b;
+- uint32_t reserved12;
+- unsigned int ordinal4;
+- };
+-
+- union {
+- uint32_t address_hi;
+- uint32_t reserved13;
+- uint32_t ordinal5;
+- };
+-
+- union {
+- uint32_t data_lo;
+- uint32_t cmp_data_lo;
+- struct {
+- uint32_t dw_offset:16;
+- uint32_t num_dwords:16;
+- } bitfields6c;
+- uint32_t reserved14;
+- uint32_t ordinal6;
+- };
+-
+- union {
+- uint32_t data_hi;
+- uint32_t cmp_data_hi;
+- uint32_t reserved15;
+- uint32_t reserved16;
+- uint32_t ordinal7;
+- };
+-
+- uint32_t int_ctxid;
+-
+-};
+-
+-#endif
+-
+-enum {
+- CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014
+-};
+-#endif
+-
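
Every MES packet in the deleted kfd_pm4_headers_ai.h above is built on union PM4_MES_TYPE_3_HEADER, which packs a reserved byte, an 8-bit IT opcode, a 14-bit "body DWORDs minus one" count and a 2-bit type (always 3) into one dword. A minimal sketch of that packing, assuming a little-endian build where the compiler allocates bitfields LSB-first; the opcode value used is a placeholder, not a real IT opcode:

/* Packs a PM4 type-3 header the way union PM4_MES_TYPE_3_HEADER above
 * lays it out: bits 7:0 reserved, 15:8 opcode, 29:16 count, 31:30 type.
 * Layout assumes LSB-first bitfield allocation (little-endian GCC/Clang).
 */
#include <stdint.h>
#include <stdio.h>

union pm4_type_3_header {
	struct {
		uint32_t reserved1 : 8;   /* reserved */
		uint32_t opcode    : 8;   /* IT opcode */
		uint32_t count     : 14;  /* DWORDs - 1 in the information body */
		uint32_t type      : 2;   /* 3 for type-3 packets */
	};
	uint32_t u32All;
};

static uint32_t build_pm4_header(uint8_t opcode, size_t packet_size_bytes)
{
	union pm4_type_3_header h = { .u32All = 0 };

	h.opcode = opcode;
	/* count is the body length in dwords minus one; since the header
	 * dword is not part of the body, that is total dwords minus two.
	 */
	h.count = (uint32_t)(packet_size_bytes / sizeof(uint32_t)) - 2;
	h.type = 3;
	return h.u32All;
}

int main(void)
{
	/* 0x21 is an arbitrary placeholder opcode, for illustration only */
	uint32_t hdr = build_pm4_header(0x21, 9 * sizeof(uint32_t));

	printf("header dword: 0x%08x\n", hdr);
	return 0;
}
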
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h
+index 0b314a8..a0ff348 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h
+@@ -77,6 +77,103 @@ struct pm4__indirect_buffer_pasid {
+
+ #endif
+
++/*--------------------_RELEASE_MEM-------------------- */
++
++#ifndef _PM4__RELEASE_MEM_DEFINED
++#define _PM4__RELEASE_MEM_DEFINED
++enum _RELEASE_MEM_event_index_enum {
++ event_index___release_mem__end_of_pipe = 5,
++ event_index___release_mem__shader_done = 6
++};
++
++enum _RELEASE_MEM_cache_policy_enum {
++ cache_policy___release_mem__lru = 0,
++ cache_policy___release_mem__stream = 1,
++ cache_policy___release_mem__bypass = 2
++};
++
++enum _RELEASE_MEM_dst_sel_enum {
++ dst_sel___release_mem__memory_controller = 0,
++ dst_sel___release_mem__tc_l2 = 1,
++ dst_sel___release_mem__queue_write_pointer_register = 2,
++ dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3
++};
++
++enum _RELEASE_MEM_int_sel_enum {
++ int_sel___release_mem__none = 0,
++ int_sel___release_mem__send_interrupt_only = 1,
++ int_sel___release_mem__send_interrupt_after_write_confirm = 2,
++ int_sel___release_mem__send_data_after_write_confirm = 3
++};
++
++enum _RELEASE_MEM_data_sel_enum {
++ data_sel___release_mem__none = 0,
++ data_sel___release_mem__send_32_bit_low = 1,
++ data_sel___release_mem__send_64_bit_data = 2,
++ data_sel___release_mem__send_gpu_clock_counter = 3,
++ data_sel___release_mem__send_cp_perfcounter_hi_lo = 4,
++ data_sel___release_mem__store_gds_data_to_memory = 5
++};
++
++struct pm4__release_mem {
++ union {
++ union PM4_MES_TYPE_3_HEADER header; /*header */
++ unsigned int ordinal1;
++ };
++
++ union {
++ struct {
++ unsigned int event_type:6;
++ unsigned int reserved1:2;
++ enum _RELEASE_MEM_event_index_enum event_index:4;
++ unsigned int tcl1_vol_action_ena:1;
++ unsigned int tc_vol_action_ena:1;
++ unsigned int reserved2:1;
++ unsigned int tc_wb_action_ena:1;
++ unsigned int tcl1_action_ena:1;
++ unsigned int tc_action_ena:1;
++ unsigned int reserved3:6;
++ unsigned int atc:1;
++ enum _RELEASE_MEM_cache_policy_enum cache_policy:2;
++ unsigned int reserved4:5;
++ } bitfields2;
++ unsigned int ordinal2;
++ };
++
++ union {
++ struct {
++ unsigned int reserved5:16;
++ enum _RELEASE_MEM_dst_sel_enum dst_sel:2;
++ unsigned int reserved6:6;
++ enum _RELEASE_MEM_int_sel_enum int_sel:3;
++ unsigned int reserved7:2;
++ enum _RELEASE_MEM_data_sel_enum data_sel:3;
++ } bitfields3;
++ unsigned int ordinal3;
++ };
++
++ union {
++ struct {
++ unsigned int reserved8:2;
++ unsigned int address_lo_32b:30;
++ } bitfields4;
++ struct {
++ unsigned int reserved9:3;
++ unsigned int address_lo_64b:29;
++ } bitfields5;
++ unsigned int ordinal4;
++ };
++
++ unsigned int address_hi;
++
++ unsigned int data_lo;
++
++ unsigned int data_hi;
++
++};
++#endif
++
++
+ /*--------------------_SET_CONFIG_REG-------------------- */
+
+ #ifndef _PM4__SET_CONFIG_REG_DEFINED
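
The pm4__release_mem packet restored in the hunk above lets the CP write a data value to a given address once preceding work reaches end-of-pipe, optionally raising an interrupt after the write is confirmed; that is how a fence value lands in memory for the driver to poll. A minimal sketch that packs ordinal2 and ordinal3 with explicit shifts instead of bitfields, using the field widths shown in the struct (LSB-first layout assumed); the event type 0x14 is CACHE_FLUSH_AND_INV_TS_EVENT from the deleted kfd_pm4_headers_ai.h and is used here purely as an illustrative end-of-pipe timestamp event:

/* ordinal2: event_type[5:0], event_index[11:8]
 * ordinal3: dst_sel[17:16], int_sel[26:24], data_sel[31:29]
 * Values mirror the _RELEASE_MEM_* enums in the hunk above.
 */
#include <stdint.h>
#include <stdio.h>

enum { EVENT_INDEX_END_OF_PIPE = 5 };              /* event_index___release_mem__end_of_pipe */
enum { DST_SEL_MEMORY_CONTROLLER = 0 };            /* dst_sel___release_mem__memory_controller */
enum { INT_SEL_SEND_INT_AFTER_WRITE_CONFIRM = 2 }; /* int_sel___release_mem__send_interrupt_after_write_confirm */
enum { DATA_SEL_SEND_32_BIT_LOW = 1 };             /* data_sel___release_mem__send_32_bit_low */

struct release_mem_dwords {
	uint32_t ordinal2;
	uint32_t ordinal3;
};

static struct release_mem_dwords pack_release_mem(void)
{
	struct release_mem_dwords d;

	d.ordinal2 = (0x14u << 0)                            /* CACHE_FLUSH_AND_INV_TS_EVENT */
		   | ((uint32_t)EVENT_INDEX_END_OF_PIPE << 8);

	d.ordinal3 = ((uint32_t)DST_SEL_MEMORY_CONTROLLER << 16)
		   | ((uint32_t)INT_SEL_SEND_INT_AFTER_WRITE_CONFIRM << 24)
		   | ((uint32_t)DATA_SEL_SEND_32_BIT_LOW << 29);

	return d;
}

int main(void)
{
	struct release_mem_dwords d = pack_release_mem();

	printf("ordinal2 = 0x%08x, ordinal3 = 0x%08x\n", d.ordinal2, d.ordinal3);
	return 0;
}
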
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index 28fac2d..b397ec7 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -24,55 +24,19 @@
+ #define KFD_PRIV_H_INCLUDED
+
+ #include <linux/hashtable.h>
+-#include <linux/version.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/mutex.h>
+ #include <linux/types.h>
+ #include <linux/atomic.h>
+ #include <linux/workqueue.h>
+ #include <linux/spinlock.h>
+-#include <linux/idr.h>
+ #include <linux/kfd_ioctl.h>
+-#include <linux/pid.h>
+-#include <linux/interval_tree.h>
+-#include <linux/seq_file.h>
+-#include <linux/kref.h>
+-#include <linux/kfifo.h>
+ #include <kgd_kfd_interface.h>
+
+-#include <drm/amd_rdma.h>
+-#include "amd_shared.h"
+-
+ #define KFD_SYSFS_FILE_MODE 0444
+
+-/* GPU ID hash width in bits */
+-#define KFD_GPU_ID_HASH_WIDTH 16
+-
+-/* Use upper bits of mmap offset to store KFD driver specific information.
+- * BITS[63:62] - Encode MMAP type
+- * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to
+- * BITS[45:40] - Reserved. Not Used.
+- * BITS[39:0] - MMAP offset value. Used by TTM.
+- *
+- * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+- * defines are w.r.t to PAGE_SIZE
+- */
+-#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)
+-#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+-#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)
+-#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT)
+-#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT)
+-
+-#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
+-#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
+- << KFD_MMAP_GPU_ID_SHIFT)
+-#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\
+- & KFD_MMAP_GPU_ID_MASK)
+-#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \
+- >> KFD_MMAP_GPU_ID_SHIFT)
+-
+-#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT)
+-#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK)
++#define KFD_MMAP_DOORBELL_MASK 0x8000000000000
++#define KFD_MMAP_EVENTS_MASK 0x4000000000000
+
+ /*
+ * When working with cp scheduler we should assign the HIQ manually or via
+@@ -84,6 +48,8 @@
+ #define KFD_CIK_HIQ_PIPE 4
+ #define KFD_CIK_HIQ_QUEUE 0
+
++/* GPU ID hash width in bits */
++#define KFD_GPU_ID_HASH_WIDTH 16
+
+ /* Macro for allocating structures */
+ #define kfd_alloc_struct(ptr_to_struct) \
+@@ -93,15 +59,6 @@
+ #define KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024
+
+ /*
+- * Size of the per-process TBA+TMA buffer: 2 pages
+- *
+- * The first page is the TBA used for the CWSR ISA code. The second
+- * page is used as TMA for daisy changing a user-mode trap handler.
+- */
+-#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2)
+-#define KFD_CWSR_TMA_OFFSET PAGE_SIZE
+-
+-/*
+ * Kernel module parameter to specify maximum number of supported queues per
+ * device
+ */
+@@ -117,50 +74,12 @@ extern int max_num_of_queues_per_device;
+ /* Kernel module parameter to specify the scheduling policy */
+ extern int sched_policy;
+
+-extern int cwsr_enable;
+-
+-/*
+- * Kernel module parameter to specify the maximum process
+- * number per HW scheduler
+- */
+-extern int hws_max_conc_proc;
+-
+ /*
+ * Kernel module parameter to specify whether to send sigterm to HSA process on
+ * unhandled exception
+ */
+ extern int send_sigterm;
+
+-/*
+- * This kernel module is used to simulate large bar machine on non-large bar
+- * enabled machines.
+- */
+-extern int debug_largebar;
+-
+-/*
+- * Ignore CRAT table during KFD initialization, can be used to work around
+- * broken CRAT tables on some AMD systems
+- */
+-extern int ignore_crat;
+-
+-/*
+- * Set sh_mem_config.retry_disable on Vega10
+- */
+-extern int vega10_noretry;
+-
+-/*
+- * Enable privileged mode for all CP queues including user queues
+- */
+-extern int priv_cp_queues;
+-
+-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) && defined(BUILD_AS_DKMS)
+-/*
+- * Currently, mm_access() function is not exported. So for DKMS build,
+- * CMA will be enabled only if module param is set.
+- */
+-extern int cma_enable;
+-#endif
+-
+ /**
+ * enum kfd_sched_policy
+ *
+@@ -193,30 +112,26 @@ enum cache_policy {
+ cache_policy_noncoherent
+ };
+
+-#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10)
++enum asic_family_type {
++ CHIP_KAVERI = 0,
++ CHIP_CARRIZO
++};
+
+ struct kfd_event_interrupt_class {
+ bool (*interrupt_isr)(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry, uint32_t *patched_ihre,
+- bool *patched_flag);
++ const uint32_t *ih_ring_entry);
+ void (*interrupt_wq)(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry);
++ const uint32_t *ih_ring_entry);
+ };
+
+ struct kfd_device_info {
+- enum amd_asic_type asic_family;
++ unsigned int asic_family;
+ const struct kfd_event_interrupt_class *event_interrupt_class;
+ unsigned int max_pasid_bits;
+ unsigned int max_no_of_hqd;
+- unsigned int doorbell_size;
+ size_t ih_ring_entry_size;
+ uint8_t num_of_watch_points;
+ uint16_t mqd_size_aligned;
+- bool is_need_iommu_device;
+- bool supports_cwsr;
+- bool needs_pci_atomics;
+- /* obtain from adev->sdma.num_instances */
+- unsigned int num_sdma_engines;
+ };
+
+ struct kfd_mem_obj {
+@@ -224,13 +139,6 @@ struct kfd_mem_obj {
+ uint32_t range_end;
+ uint64_t gpu_addr;
+ uint32_t *cpu_ptr;
+- void *gtt_mem;
+-};
+-
+-struct kfd_vmid_info {
+- uint32_t first_vmid_kfd;
+- uint32_t last_vmid_kfd;
+- uint32_t vmid_num_kfd;
+ };
+
+ struct kfd_dev {
+@@ -249,12 +157,14 @@ struct kfd_dev {
+ * to HW doorbell, GFX reserved some
+ * at the start)
+ */
++ size_t doorbell_process_limit; /* Number of processes we have doorbell
++ * space for.
++ */
+ u32 __iomem *doorbell_kernel_ptr; /* This is a pointer for a doorbells
+ * page used by kernel queue
+ */
+
+ struct kgd2kfd_shared_resources shared_resources;
+- struct kfd_vmid_info vm_info;
+
+ const struct kfd2kgd_calls *kfd2kgd;
+ struct mutex doorbell_mutex;
+@@ -270,8 +180,10 @@ struct kfd_dev {
+ unsigned int gtt_sa_num_of_chunks;
+
+ /* Interrupts */
+- struct kfifo ih_fifo;
+- struct workqueue_struct *ih_wq;
++ void *interrupt_ring;
++ size_t interrupt_ring_size;
++ atomic_t interrupt_ring_rptr;
++ atomic_t interrupt_ring_wptr;
+ struct work_struct interrupt_work;
+ spinlock_t interrupt_lock;
+
+@@ -279,7 +191,6 @@ struct kfd_dev {
+ struct device_queue_manager *dqm;
+
+ bool init_complete;
+-
+ /*
+ * Interrupts of interest to KFD are copied
+ * from the HW ring into a SW ring.
+@@ -287,31 +198,7 @@ struct kfd_dev {
+ bool interrupts_active;
+
+ /* Debug manager */
+- struct kfd_dbgmgr *dbgmgr;
+-
+- /* MEC firmware version*/
+- uint16_t mec_fw_version;
+-
+- /* Maximum process number mapped to HW scheduler */
+- unsigned int max_proc_per_quantum;
+-
+- /* CWSR */
+- bool cwsr_enabled;
+- const void *cwsr_isa;
+- unsigned int cwsr_isa_size;
+-
+- /* IB usage */
+- uint32_t ib_size;
+-};
+-
+-struct kfd_ipc_obj;
+-
+-struct kfd_bo {
+- void *mem;
+- struct interval_tree_node it;
+- struct kfd_dev *dev;
+- struct list_head cb_data_head;
+- struct kfd_ipc_obj *kfd_ipc_obj;
++ struct kfd_dbgmgr *dbgmgr;
+ };
+
+ /* KGD2KFD callbacks */
+@@ -334,22 +221,22 @@ void kfd_chardev_exit(void);
+ struct device *kfd_chardev(void);
+
+ /**
+- * enum kfd_unmap_queues_filter
++ * enum kfd_preempt_type_filter
+ *
+- * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue.
++ * @KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE: Preempts single queue.
+ *
+- * @KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: Preempts all queues in the
++ * @KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES: Preempts all queues in the
+ * running queues list.
+ *
+- * @KFD_UNMAP_QUEUES_FILTER_BY_PASID: Preempts queues that belongs to
++ * @KFD_PREEMPT_TYPE_FILTER_BY_PASID: Preempts queues that belong to
+ * specific process.
+ *
+ */
+-enum kfd_unmap_queues_filter {
+- KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE,
+- KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+- KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
+- KFD_UNMAP_QUEUES_FILTER_BY_PASID
++enum kfd_preempt_type_filter {
++ KFD_PREEMPT_TYPE_FILTER_SINGLE_QUEUE,
++ KFD_PREEMPT_TYPE_FILTER_ALL_QUEUES,
++ KFD_PREEMPT_TYPE_FILTER_DYNAMIC_QUEUES,
++ KFD_PREEMPT_TYPE_FILTER_BY_PASID
+ };
+
+ /**
+@@ -375,11 +262,6 @@ enum kfd_queue_format {
+ KFD_QUEUE_FORMAT_AQL
+ };
+
+-enum KFD_QUEUE_PRIORITY {
+- KFD_QUEUE_PRIORITY_MINIMUM = 0,
+- KFD_QUEUE_PRIORITY_MAXIMUM = 15
+-};
+-
+ /**
+ * struct queue_properties
+ *
+@@ -434,10 +316,9 @@ struct queue_properties {
+ uint32_t queue_percent;
+ uint32_t *read_ptr;
+ uint32_t *write_ptr;
+- void __iomem *doorbell_ptr;
++ uint32_t __iomem *doorbell_ptr;
+ uint32_t doorbell_off;
+ bool is_interop;
+- bool is_evicted; /* true -> queue is evicted */
+ bool is_active;
+ /* Not relevant for user mode queues in cp scheduling */
+ unsigned int vmid;
+@@ -450,12 +331,6 @@ struct queue_properties {
+ uint32_t eop_ring_buffer_size;
+ uint64_t ctx_save_restore_area_address;
+ uint32_t ctx_save_restore_area_size;
+- uint32_t ctl_stack_size;
+- uint64_t tba_addr;
+- uint64_t tma_addr;
+- /* Relevant for CU */
+- uint32_t cu_mask_count; /* Must be a multiple of 32 */
+- uint32_t *cu_mask;
+ };
+
+ /**
+@@ -500,7 +375,6 @@ struct queue {
+ uint32_t queue;
+
+ unsigned int sdma_id;
+- unsigned int doorbell_id;
+
+ struct kfd_process *process;
+ struct kfd_dev *device;
+@@ -517,19 +391,6 @@ enum KFD_MQD_TYPE {
+ KFD_MQD_TYPE_MAX
+ };
+
+-enum KFD_PIPE_PRIORITY {
+- KFD_PIPE_PRIORITY_CS_LOW = 0,
+- KFD_PIPE_PRIORITY_CS_MEDIUM,
+- KFD_PIPE_PRIORITY_CS_HIGH
+-};
+-
+-enum KFD_SPI_PRIORITY {
+- KFD_SPI_PRIORITY_EXTRA_LOW = 0,
+- KFD_SPI_PRIORITY_LOW,
+- KFD_SPI_PRIORITY_MEDIUM,
+- KFD_SPI_PRIORITY_HIGH
+-};
+-
+ struct scheduling_resources {
+ unsigned int vmid_mask;
+ enum kfd_queue_type type;
+@@ -543,6 +404,7 @@ struct scheduling_resources {
+ struct process_queue_manager {
+ /* data */
+ struct kfd_process *process;
++ unsigned int num_concurrent_processes;
+ struct list_head queues;
+ unsigned long *queue_slot_bitmap;
+ };
+@@ -556,16 +418,8 @@ struct qcm_process_device {
+ struct list_head priv_queue_list;
+
+ unsigned int queue_count;
+- /* a data field only meaningful for non-HWS case */
+ unsigned int vmid;
+ bool is_debug;
+- unsigned int evicted; /* eviction counter, 0=active */
+-
+- /* This flag tells if we should reset all wavefronts on
+- * process termination
+- */
+- bool reset_wavefronts;
+-
+ /*
+ * All the memory management data should be here too
+ */
+@@ -579,49 +433,6 @@ struct qcm_process_device {
+ uint32_t num_gws;
+ uint32_t num_oac;
+ uint32_t sh_hidden_private_base;
+-
+- /* CWSR memory */
+- void *cwsr_kaddr;
+- uint64_t cwsr_base;
+- uint64_t tba_addr;
+- uint64_t tma_addr;
+-
+- /* IB memory */
+- uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */
+- void *ib_kaddr;
+-
+- /*doorbell resources per process per device*/
+- unsigned long *doorbell_bitmap;
+-};
+-
+-/* KFD Memory Eviction */
+-
+-/* Approx. wait time before attempting to restore evicted BOs */
+-#define PROCESS_RESTORE_TIME_MS 100
+-/* Approx. back off time if restore fails due to lack of memory */
+-#define PROCESS_BACK_OFF_TIME_MS 100
+-/* Approx. time before evicting the process again */
+-#define PROCESS_ACTIVE_TIME_MS 10
+-
+-void kfd_evict_bo_worker(struct work_struct *work);
+-void kfd_restore_bo_worker(struct work_struct *work);
+-int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
+- struct dma_fence *fence);
+-int quiesce_process_mm(struct kfd_process *p);
+-
+-
+-/* 8 byte handle containing GPU ID in the most significant 4 bytes and
+- * idr_handle in the least significant 4 bytes
+- */
+-#define MAKE_HANDLE(gpu_id, idr_handle) \
+- (((uint64_t)(gpu_id) << 32) + idr_handle)
+-#define GET_GPU_ID(handle) (handle >> 32)
+-#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF)
+-
+-enum kfd_pdd_bound {
+- PDD_UNBOUND = 0,
+- PDD_BOUND,
+- PDD_BOUND_SUSPENDED,
+ };
+
+ /* Data that is per-process-per device. */
+@@ -635,8 +446,6 @@ struct kfd_process_device {
+ /* The device that owns this data. */
+ struct kfd_dev *dev;
+
+- /* The process that owns this kfd_process_device. */
+- struct kfd_process *process;
+
+ /* per-process-per device QCM data structure */
+ struct qcm_process_device qpd;
+@@ -648,24 +457,14 @@ struct kfd_process_device {
+ uint64_t gpuvm_limit;
+ uint64_t scratch_base;
+ uint64_t scratch_limit;
+- uint64_t dgpu_base;
+- uint64_t dgpu_limit;
+
+ /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */
+- enum kfd_pdd_bound bound;
+-
+- /* VM context for GPUVM allocations */
+- void *vm;
++ bool bound;
+
+- /* GPUVM allocations storage */
+- struct idr alloc_idr;
+-
+- /* Flag used to tell the pdd has dequeued from the dqm.
+- * This is used to prevent dev->dqm->ops.process_termination() from
+- * being called twice when it is already called in IOMMU callback
+- * function.
++ /* This flag tells if we should reset all
++ * wavefronts on process termination
+ */
+- bool already_dequeued;
++ bool reset_wavefronts;
+ };
+
+ #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
+@@ -678,15 +477,7 @@ struct kfd_process {
+ */
+ struct hlist_node kfd_processes;
+
+- /*
+- * Opaque pointer to mm_struct. We don't hold a reference to
+- * it so it should never be dereferenced from here. This is
+- * only used for looking up processes by their mm.
+- */
+- void *mm;
+-
+- struct kref ref;
+- struct work_struct release_work;
++ struct mm_struct *mm;
+
+ struct mutex mutex;
+
+@@ -694,8 +485,6 @@ struct kfd_process {
+ * In any process, the thread that started main() is the lead
+ * thread and outlives the rest.
+ * It is here because amd_iommu_bind_pasid wants a task_struct.
+- * It can also be used for safely getting a reference to the
+- * mm_struct of the process.
+ */
+ struct task_struct *lead_thread;
+
+@@ -706,7 +495,6 @@ struct kfd_process {
+ struct rcu_head rcu;
+
+ unsigned int pasid;
+- unsigned int doorbell_index;
+
+ /*
+ * List of kfd_process_device structures,
+@@ -716,37 +504,23 @@ struct kfd_process {
+
+ struct process_queue_manager pqm;
+
++ /* The process's queues. */
++ size_t queue_array_size;
++
++ /* Size is queue_array_size, up to MAX_PROCESS_QUEUES. */
++ struct kfd_queue **queues;
++
+ /*Is the user space process 32 bit?*/
+ bool is_32bit_user_mode;
+
+ /* Event-related data */
+ struct mutex event_mutex;
+- /* Event ID allocator and lookup */
+- struct idr event_idr;
+- /* Event page */
+- struct kfd_signal_page *signal_page;
++ /* All events in process hashed by ID, linked on kfd_event.events. */
++ DECLARE_HASHTABLE(events, 4);
++ /* struct slot_page_header.event_pages */
++ struct list_head signal_event_pages;
++ u32 next_nonsignal_event_id;
+ size_t signal_event_count;
+- bool signal_event_limit_reached;
+-
+- struct rb_root_cached bo_interval_tree;
+-
+- /* Information used for memory eviction */
+- void *process_info;
+- /* Eviction fence that is attached to all the BOs of this process. The
+- * fence will be triggered during eviction and new one will be created
+- * during restore
+- */
+- struct dma_fence *ef;
+-
+- /* Work items for evicting and restoring BOs */
+- struct delayed_work eviction_work;
+- struct delayed_work restore_work;
+- /* seqno of the last scheduled eviction */
+- unsigned int last_eviction_seqno;
+- /* Approx. the last timestamp (in jiffies) when the process was
+- * restored after an eviction
+- */
+- unsigned long last_restore_timestamp;
+ };
+
+ /**
+@@ -769,50 +543,18 @@ struct amdkfd_ioctl_desc {
+
+ void kfd_process_create_wq(void);
+ void kfd_process_destroy_wq(void);
+-struct kfd_process *kfd_create_process(struct file *filep);
+-struct kfd_process *kfd_get_process(const struct task_struct *task);
++struct kfd_process *kfd_create_process(const struct task_struct *);
++struct kfd_process *kfd_get_process(const struct task_struct *);
+ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid);
+-struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm);
+-void kfd_unref_process(struct kfd_process *p);
+-void kfd_suspend_all_processes(void);
+-int kfd_resume_all_processes(void);
+
+ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
+ struct kfd_process *p);
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+-int kfd_bind_processes_to_device(struct kfd_dev *dev);
+-void kfd_unbind_processes_from_device(struct kfd_dev *dev);
+-#endif
+-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid);
++void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
+ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
+ struct kfd_process *p);
+ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+ struct kfd_process *p);
+
+-int kfd_reserved_mem_mmap(struct kfd_process *process,
+- struct vm_area_struct *vma);
+-
+-/* KFD process API for creating and translating handles */
+-int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+- void *mem, uint64_t start,
+- uint64_t length,
+- struct kfd_ipc_obj *ipc_obj);
+-void *kfd_process_device_translate_handle(struct kfd_process_device *p,
+- int handle);
+-struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd,
+- int handle);
+-void *kfd_process_find_bo_from_interval(struct kfd_process *p,
+- uint64_t start_addr,
+- uint64_t last_addr);
+-void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
+- int handle);
+-
+-void run_rdma_free_callback(struct kfd_bo *buf_obj);
+-struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid);
+-
+-/* kfd dgpu memory */
+-int kfd_unmap_memory_from_gpu(void *mem, struct kfd_process_device *pdd);
+-
+ /* Process device data iterator */
+ struct kfd_process_device *kfd_get_first_process_device_data(
+ struct kfd_process *p);
+@@ -830,24 +572,17 @@ unsigned int kfd_pasid_alloc(void);
+ void kfd_pasid_free(unsigned int pasid);
+
+ /* Doorbells */
+-size_t kfd_doorbell_process_slice(struct kfd_dev *kfd);
+ int kfd_doorbell_init(struct kfd_dev *kfd);
+ void kfd_doorbell_fini(struct kfd_dev *kfd);
+-int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process,
+- struct vm_area_struct *vma);
+-void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
++int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma);
++u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
+ unsigned int *doorbell_off);
+ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr);
+ u32 read_kernel_doorbell(u32 __iomem *db);
+-void write_kernel_doorbell(void __iomem *db, u32 value);
+-void write_kernel_doorbell64(void __iomem *db, u64 value);
+-unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd,
++void write_kernel_doorbell(u32 __iomem *db, u32 value);
++unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
+ struct kfd_process *process,
+- unsigned int doorbell_id);
+-phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
+- struct kfd_process *process);
+-int kfd_alloc_process_doorbells(struct kfd_process *process);
+-void kfd_free_process_doorbells(struct kfd_process *process);
++ unsigned int queue_id);
+
+ /* GTT Sub-Allocator */
+
+@@ -863,37 +598,27 @@ int kfd_topology_init(void);
+ void kfd_topology_shutdown(void);
+ int kfd_topology_add_device(struct kfd_dev *gpu);
+ int kfd_topology_remove_device(struct kfd_dev *gpu);
+-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+- uint32_t proximity_domain);
+ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
+ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
+-struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd);
+-int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
+-int kfd_numa_node_to_apic_id(int numa_node_id);
++struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx);
+
+ /* Interrupts */
+ int kfd_interrupt_init(struct kfd_dev *dev);
+ void kfd_interrupt_exit(struct kfd_dev *dev);
+ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
+ bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry);
+-bool interrupt_is_wanted(struct kfd_dev *dev,
+- const uint32_t *ih_ring_entry,
+- uint32_t *patched_ihre, bool *flag);
++bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry);
+
+ /* Power Management */
+ void kgd2kfd_suspend(struct kfd_dev *kfd);
+ int kgd2kfd_resume(struct kfd_dev *kfd);
+
+-/* GPU reset */
+-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+-int kgd2kfd_post_reset(struct kfd_dev *kfd);
+-
+ /* amdkfd Apertures */
+ int kfd_init_apertures(struct kfd_process *process);
+-int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+- uint64_t base, uint64_t limit);
+
+ /* Queue Context Management */
++struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd);
++
+ int init_queue(struct queue **q, const struct queue_properties *properties);
+ void uninit_queue(struct queue *q);
+ void print_queue_properties(struct queue_properties *q);
+@@ -903,20 +628,13 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
+ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
+-struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev);
+ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
+ struct kfd_dev *dev);
+-struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev);
+-struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
+- struct kfd_dev *dev);
+ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
+ void device_queue_manager_uninit(struct device_queue_manager *dqm);
+ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
+ enum kfd_queue_type type);
+ void kernel_queue_uninit(struct kernel_queue *kq);
+-int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
+
+ /* Process Queue Manager */
+ struct process_queue_node {
+@@ -925,40 +643,32 @@ struct process_queue_node {
+ struct list_head process_queue_list;
+ };
+
+-void kfd_process_dequeue_from_device(struct kfd_process_device *pdd);
+-void kfd_process_dequeue_from_all_devices(struct kfd_process *p);
+ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p);
+ void pqm_uninit(struct process_queue_manager *pqm);
+ int pqm_create_queue(struct process_queue_manager *pqm,
+ struct kfd_dev *dev,
+ struct file *f,
+ struct queue_properties *properties,
++ unsigned int flags,
++ enum kfd_queue_type type,
+ unsigned int *qid);
+ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid);
+ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
+ struct queue_properties *p);
+-int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
+- struct queue_properties *p);
+ struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
+ unsigned int qid);
+-int pqm_get_wave_state(struct process_queue_manager *pqm,
+- unsigned int qid,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size);
+-int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm);
+-int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm);
+
+ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
+ unsigned int fence_value,
+- unsigned int timeout_ms);
++ unsigned long timeout);
+
+ /* Packet Manager */
+
++#define KFD_HIQ_TIMEOUT (500)
++
+ #define KFD_FENCE_COMPLETED (100)
+ #define KFD_FENCE_INIT (10)
+-
+-struct packet_manager_func;
++#define KFD_UNMAP_LATENCY (150)
+
+ struct packet_manager {
+ struct device_queue_manager *dqm;
+@@ -966,42 +676,9 @@ struct packet_manager {
+ struct mutex lock;
+ bool allocated;
+ struct kfd_mem_obj *ib_buffer_obj;
+- unsigned int ib_size_bytes;
+-
+- struct packet_manager_funcs *pmf;
+ };
+
+-struct packet_manager_funcs {
+- /* Support different firmware versions for PM4 packets */
+- int (*map_process)(struct packet_manager *pm, uint32_t *buffer,
+- struct qcm_process_device *qpd);
+- int (*runlist)(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t ib, size_t ib_size_in_dwords, bool chain);
+- int (*set_resources)(struct packet_manager *pm, uint32_t *buffer,
+- struct scheduling_resources *res);
+- int (*map_queues)(struct packet_manager *pm, uint32_t *buffer,
+- struct queue *q, bool is_static);
+- int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer,
+- enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter mode,
+- uint32_t filter_param, bool reset,
+- unsigned int sdma_engine);
+- int (*query_status)(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t fence_address, uint32_t fence_value);
+- uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer);
+-
+- uint32_t (*get_map_process_packet_size)(void);
+- uint32_t (*get_runlist_packet_size)(void);
+- uint32_t (*get_set_resources_packet_size)(void);
+- uint32_t (*get_map_queues_packet_size)(void);
+- uint32_t (*get_unmap_queues_packet_size)(void);
+- uint32_t (*get_query_status_packet_size)(void);
+- uint32_t (*get_release_mem_packet_size)(void);
+-
+-};
+-
+-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+- uint16_t fw_ver);
++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
+ void pm_uninit(struct packet_manager *pm);
+ int pm_send_set_resources(struct packet_manager *pm,
+ struct scheduling_resources *res);
+@@ -1010,109 +687,47 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
+ uint32_t fence_value);
+
+ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter mode,
++ enum kfd_preempt_type_filter mode,
+ uint32_t filter_param, bool reset,
+ unsigned int sdma_engine);
+
+ void pm_release_ib(struct packet_manager *pm);
+
+-/* Following PM funcs can be shared among CIK and VI */
+-unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
+-int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t ib, size_t ib_size_in_dwords, bool chain);
+-int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct queue *q, bool is_static);
+-int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
+- struct scheduling_resources *res);
+-int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
+- enum kfd_queue_type type,
+- enum kfd_unmap_queues_filter filter,
+- uint32_t filter_param, bool reset,
+- unsigned int sdma_engine);
+-int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer,
+- uint64_t fence_address, uint32_t fence_value);
+-uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer);
+-
+-uint32_t pm_get_map_process_packet_size_vi(void);
+-uint32_t pm_get_runlist_packet_size_vi(void);
+-uint32_t pm_get_set_resources_packet_size_vi(void);
+-uint32_t pm_get_map_queues_packet_size_vi(void);
+-uint32_t pm_get_unmap_queues_packet_size_vi(void);
+-uint32_t pm_get_query_status_packet_size_vi(void);
+-uint32_t pm_get_release_mem_packet_size_vi(void);
+-
+-
+-void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver);
+-void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver);
+-
+-void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver);
+-
+-
+ uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
++phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev,
++ struct kfd_process *process);
+
+ /* Events */
+ extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
+-extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
+-
+ extern const struct kfd_device_global_init_class device_global_init_class_cik;
+
++enum kfd_event_wait_result {
++ KFD_WAIT_COMPLETE,
++ KFD_WAIT_TIMEOUT,
++ KFD_WAIT_ERROR
++};
++
+ void kfd_event_init_process(struct kfd_process *p);
+ void kfd_event_free_process(struct kfd_process *p);
+ int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma);
+ int kfd_wait_on_events(struct kfd_process *p,
+ uint32_t num_events, void __user *data,
+ bool all, uint32_t user_timeout_ms,
+- uint32_t *wait_result);
++ enum kfd_event_wait_result *wait_result);
+ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
+ uint32_t valid_id_bits);
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ void kfd_signal_iommu_event(struct kfd_dev *dev,
+ unsigned int pasid, unsigned long address,
+ bool is_write_requested, bool is_execute_requested);
+-#endif
+ void kfd_signal_hw_exception_event(unsigned int pasid);
+ int kfd_set_event(struct kfd_process *p, uint32_t event_id);
+ int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
+ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
+ uint32_t event_type, bool auto_reset, uint32_t node_id,
+ uint32_t *event_id, uint32_t *event_trigger_data,
+- uint64_t *event_page_offset, uint32_t *event_slot_index,
+- void *kern_addr);
++ uint64_t *event_page_offset, uint32_t *event_slot_index);
+ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
+
+-void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+- struct kfd_vm_fault_info *info);
+-
+-void kfd_flush_tlb(struct kfd_dev *dev, struct kfd_process *p);
+-
+ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
+
+-#define KFD_SCRATCH_KV_FW_VER 413
+-
+-/* PeerDirect support */
+-void kfd_init_peer_direct(void);
+-void kfd_close_peer_direct(void);
+-
+-/* IPC Support */
+-int kfd_ipc_init(void);
+-
+-/* Debugfs */
+-#if defined(CONFIG_DEBUG_FS)
+-
+-void kfd_debugfs_init(void);
+-void kfd_debugfs_fini(void);
+-int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data);
+-int pqm_debugfs_mqds(struct seq_file *m, void *data);
+-int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data);
+-int device_queue_manager_debugfs_hqds(struct seq_file *m, void *data);
+-int kfd_debugfs_rls_by_device(struct seq_file *m, void *data);
+-int pm_debugfs_runlist(struct seq_file *m, void *data);
+-
+-#else
+-
+-static inline void kfd_debugfs_init(void) {}
+-static inline void kfd_debugfs_fini(void) {}
+-
+-#endif
+-
+ #endif
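
The kfd_priv.h hunk above trades the structured mmap-offset encoding (MMAP type in bits 63:62, gpu_id in 61:46, the offset value in 39:0, all expressed in pages, i.e. shifted down by PAGE_SHIFT because vm_pgoff counts pages) for the old single-bit KFD_MMAP_DOORBELL_MASK/KFD_MMAP_EVENTS_MASK scheme. A minimal sketch of the encoding being removed, with the macros copied from the deleted lines; PAGE_SHIFT = 12 and gpu_id = 0x2f1b are assumptions for illustration only:

/* Recomputes the KFD mmap-offset encoding deleted in the hunk above:
 * BITS[63:62] mmap type, BITS[61:46] gpu_id, BITS[39:0] offset value,
 * with everything shifted by PAGE_SHIFT because vm_pgoff is in pages.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12                       /* assumed 4 KiB pages */
#define KFD_GPU_ID_HASH_WIDTH 16

#define KFD_MMAP_TYPE_SHIFT   (62 - PAGE_SHIFT)
#define KFD_MMAP_TYPE_MASK    (0x3ULL << KFD_MMAP_TYPE_SHIFT)
#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT)

#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
#define KFD_MMAP_GPU_ID_MASK  (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
				<< KFD_MMAP_GPU_ID_SHIFT)
#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)(gpu_id)) << KFD_MMAP_GPU_ID_SHIFT) \
				& KFD_MMAP_GPU_ID_MASK)
#define KFD_MMAP_GPU_ID_GET(offset) (((offset) & KFD_MMAP_GPU_ID_MASK) \
				>> KFD_MMAP_GPU_ID_SHIFT)

int main(void)
{
	uint32_t gpu_id = 0x2f1b;   /* illustrative value, not a real topology id */

	/* Doorbell mapping offset as it would appear in vm_pgoff (in pages) */
	uint64_t pgoff = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(gpu_id);

	printf("vm_pgoff       = 0x%016llx\n", (unsigned long long)pgoff);
	printf("decoded type   = 0x%llx (3 == doorbell)\n",
	       (unsigned long long)((pgoff & KFD_MMAP_TYPE_MASK) >> KFD_MMAP_TYPE_SHIFT));
	printf("decoded gpu_id = 0x%llx\n",
	       (unsigned long long)KFD_MMAP_GPU_ID_GET(pgoff));

	/* The old stack restored by this patch keys the mmap offset off
	 * single high bits instead (KFD_MMAP_DOORBELL_MASK):
	 */
	printf("old doorbell bit = 0x%llx\n", 0x8000000000000ULL);
	return 0;
}
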
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+index 71438ac..c74cf22 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+@@ -24,24 +24,24 @@
+ #include <linux/log2.h>
+ #include <linux/sched.h>
+ #include <linux/sched/mm.h>
+-#include <linux/sched/task.h>
+ #include <linux/slab.h>
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+ #include <linux/amd-iommu.h>
+-#endif
+ #include <linux/notifier.h>
+ #include <linux/compat.h>
+-#include <linux/mman.h>
+-#include <asm/page.h>
+-#include "kfd_ipc.h"
+
+ struct mm_struct;
+
+ #include "kfd_priv.h"
+-#include "kfd_device_queue_manager.h"
+ #include "kfd_dbgmgr.h"
+
+ /*
++ * Initial size for the array of queues.
++ * The allocated size is doubled each time
++ * it is exceeded up to MAX_PROCESS_QUEUES.
++ */
++#define INITIAL_QUEUE_ARRAY_SIZE 16
++
++/*
+ * List of struct kfd_process (field kfd_process).
+ * Unique/indexed by mm_struct*
+ */
+@@ -53,16 +53,13 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu);
+
+ static struct workqueue_struct *kfd_process_wq;
+
+-#define MIN_IDR_ID 1
+-#define MAX_IDR_ID 0 /*0 - for unlimited*/
+-
+-static struct kfd_process *find_process(const struct task_struct *thread,
+- bool ref);
+-static void kfd_process_ref_release(struct kref *ref);
+-static struct kfd_process *create_process(const struct task_struct *thread,
+- struct file *filep);
+-static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep);
++struct kfd_process_release_work {
++ struct work_struct kfd_work;
++ struct kfd_process *p;
++};
+
++static struct kfd_process *find_process(const struct task_struct *thread);
++static struct kfd_process *create_process(const struct task_struct *thread);
+
+ void kfd_process_create_wq(void)
+ {
+@@ -78,135 +75,10 @@ void kfd_process_destroy_wq(void)
+ }
+ }
+
+-static void kfd_process_free_gpuvm(struct kgd_mem *mem,
+- struct kfd_process_device *pdd)
+-{
+- kfd_unmap_memory_from_gpu(mem, pdd);
+- pdd->dev->kfd2kgd->free_memory_of_gpu(pdd->dev->kgd, mem, pdd->vm);
+-}
+-
+-/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
+- * This function should be only called right after the process
+- * is created and when kfd_processes_mutex is still being held
+- * to avoid concurrency. Because of that exclusiveness, we do
+- * not need to take p->mutex.
+- */
+-static int kfd_process_alloc_gpuvm(struct kfd_process *p,
+- struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size,
+- void **kptr, struct kfd_process_device *pdd, uint32_t flags)
+-{
+- int err;
+- void *mem = NULL;
+- int handle;
+-
+- err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size,
+- pdd->vm,
+- (struct kgd_mem **)&mem, NULL, flags);
+- if (err)
+- goto err_alloc_mem;
+-
+- err = kdev->kfd2kgd->map_memory_to_gpu(
+- kdev->kgd, (struct kgd_mem *)mem, pdd->vm);
+- if (err)
+- goto err_map_mem;
+-
+- err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem,
+- true);
+- if (err) {
+- pr_debug("Sync memory failed, wait interrupted by user signal\n");
+- goto sync_memory_failed;
+- }
+-
+- /* Create an obj handle so kfd_process_device_remove_obj_handle
+- * will take care of the bo removal when the process finishes.
+- * We do not need to take p->mutex, because the process is just
+- * created and the ioctls have not had the chance to run.
+- */
+- handle = kfd_process_device_create_obj_handle(
+- pdd, mem, gpu_va, size, NULL);
+-
+- if (handle < 0) {
+- err = handle;
+- goto free_gpuvm;
+- }
+-
+- if (kptr) {
+- err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd,
+- (struct kgd_mem *)mem, kptr);
+- if (err) {
+- pr_debug("Map GTT BO to kernel failed\n");
+- goto free_obj_handle;
+- }
+- }
+-
+- return err;
+-
+-free_obj_handle:
+- kfd_process_device_remove_obj_handle(pdd, handle);
+-free_gpuvm:
+-sync_memory_failed:
+- kfd_process_free_gpuvm(mem, pdd);
+- return err;
+-
+-err_map_mem:
+- kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem, pdd->vm);
+-err_alloc_mem:
+- *kptr = NULL;
+- return err;
+-}
+-
+-/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage
+- * The memory reserved is for KFD to submit IB to AMDGPU from kernel.
+- * If the memory is reserved successfully, ib_kaddr_assigned will have
+- * the CPU/kernel address. Check ib_kaddr_assigned before accessing the
+- * memory.
+- */
+-static int kfd_process_reserve_ib_mem(struct kfd_process *p)
+-{
+- int ret = 0;
+- struct kfd_process_device *temp, *pdd = NULL;
+- struct kfd_dev *kdev = NULL;
+- struct qcm_process_device *qpd = NULL;
+- void *kaddr;
+- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
+- ALLOC_MEM_FLAGS_NO_SUBSTITUTE |
+- ALLOC_MEM_FLAGS_EXECUTE_ACCESS;
+-
+- list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+- per_device_list) {
+- kdev = pdd->dev;
+- qpd = &pdd->qpd;
+- if (!kdev->ib_size || qpd->ib_kaddr)
+- continue;
+-
+- if (qpd->ib_base) { /* is dGPU */
+- ret = kfd_process_alloc_gpuvm(p, kdev,
+- qpd->ib_base, kdev->ib_size,
+- &kaddr, pdd, flags);
+- if (!ret)
+- qpd->ib_kaddr = kaddr;
+- else
+- /* In case of error, the kfd_bos for some pdds
+- * which are already allocated successfully
+- * will be freed in upper level function
+- * i.e. create_process().
+- */
+- return ret;
+- } else {
+- /* FIXME: Support APU */
+- continue;
+- }
+- }
+-
+- return 0;
+-}
+-
+-struct kfd_process *kfd_create_process(struct file *filep)
++struct kfd_process *kfd_create_process(const struct task_struct *thread)
+ {
+ struct kfd_process *process;
+
+- struct task_struct *thread = current;
+-
+ if (!thread->mm)
+ return ERR_PTR(-EINVAL);
+
+@@ -214,6 +86,9 @@ struct kfd_process *kfd_create_process(struct file *filep)
+ if (thread->group_leader->mm != thread->mm)
+ return ERR_PTR(-EINVAL);
+
++ /* Take mmap_sem because we call __mmu_notifier_register inside */
++ down_write(&thread->mm->mmap_sem);
++
+ /*
+ * take kfd processes mutex before starting of process creation
+ * so there won't be a case where two threads of the same process
+@@ -222,14 +97,17 @@ struct kfd_process *kfd_create_process(struct file *filep)
+ mutex_lock(&kfd_processes_mutex);
+
+ /* A prior open of /dev/kfd could have already created the process. */
+- process = find_process(thread, false);
++ process = find_process(thread);
+ if (process)
+ pr_debug("Process already found\n");
+- else
+- process = create_process(thread, filep);
++
++ if (!process)
++ process = create_process(thread);
+
+ mutex_unlock(&kfd_processes_mutex);
+
++ up_write(&thread->mm->mmap_sem);
++
+ return process;
+ }
+
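
The restored kfd_create_process() above serializes creation under kfd_processes_mutex (and, in the old stack, the task's mmap_sem) so that two threads of one process opening /dev/kfd cannot both create a kfd_process: look the process up first, create it only if the lookup misses, all while holding the same lock. A minimal user-space sketch of that find-or-create pattern with a pthread mutex; the fixed-size table and integer keys are toy stand-ins for the kernel's mm_struct-keyed hashtable:

/* Find-or-create under one lock, mirroring the find_process()/
 * create_process() sequence in kfd_create_process() above.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_process {
	int key;   /* stands in for the mm_struct pointer */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_process *table[16];
static int table_used;

static struct toy_process *find_process(int key)
{
	for (int i = 0; i < table_used; i++)
		if (table[i]->key == key)
			return table[i];
	return NULL;
}

static struct toy_process *create_process(int key)
{
	struct toy_process *p;

	if (table_used >= (int)(sizeof(table) / sizeof(table[0])))
		return NULL;
	p = malloc(sizeof(*p));
	if (!p)
		return NULL;
	p->key = key;
	table[table_used++] = p;
	return p;
}

static struct toy_process *toy_create_process(int key)
{
	struct toy_process *p;

	pthread_mutex_lock(&table_lock);
	p = find_process(key);          /* a prior open may already have created it */
	if (!p)
		p = create_process(key);
	pthread_mutex_unlock(&table_lock);
	return p;
}

int main(void)
{
	struct toy_process *a = toy_create_process(42);
	struct toy_process *b = toy_create_process(42);

	printf("same object: %s\n", a == b ? "yes" : "no");
	return 0;
}
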
+@@ -244,7 +122,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread)
+ if (thread->group_leader->mm != thread->mm)
+ return ERR_PTR(-EINVAL);
+
+- process = find_process(thread, false);
++ process = find_process(thread);
+
+ return process;
+ }
+@@ -261,156 +139,79 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm)
+ return NULL;
+ }
+
+-static struct kfd_process *find_process(const struct task_struct *thread,
+- bool ref)
++static struct kfd_process *find_process(const struct task_struct *thread)
+ {
+ struct kfd_process *p;
+ int idx;
+
+ idx = srcu_read_lock(&kfd_processes_srcu);
+ p = find_process_by_mm(thread->mm);
+- if (p && ref)
+- kref_get(&p->ref);
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ return p;
+ }
+
+-void kfd_unref_process(struct kfd_process *p)
+-{
+- kref_put(&p->ref, kfd_process_ref_release);
+-}
+-
+-/* This increments the process->ref counter. */
+-struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid)
++static void kfd_process_wq_release(struct work_struct *work)
+ {
+- struct task_struct *task = NULL;
+- struct kfd_process *p = NULL;
+-
+- if (!pid)
+- task = current;
+- else
+- task = get_pid_task(pid, PIDTYPE_PID);
+-
+- if (task)
+- p = find_process(task, true);
++ struct kfd_process_release_work *my_work;
++ struct kfd_process_device *pdd, *temp;
++ struct kfd_process *p;
+
+- return p;
+-}
++ my_work = (struct kfd_process_release_work *) work;
+
+-static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p)
+-{
+- struct kfd_process_device *pdd, *peer_pdd;
+- struct kfd_bo *buf_obj;
+- int id;
++ p = my_work->p;
+
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- /*
+- * Remove all handles from idr and release appropriate
+- * local memory object
+- */
+- idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) {
+- list_for_each_entry(peer_pdd, &p->per_device_data,
+- per_device_list) {
+- peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu(
+- peer_pdd->dev->kgd,
+- buf_obj->mem, peer_pdd->vm);
+- }
+-
+- run_rdma_free_callback(buf_obj);
+- pdd->dev->kfd2kgd->free_memory_of_gpu(
+- pdd->dev->kgd, buf_obj->mem, pdd->vm);
+- kfd_process_device_remove_obj_handle(pdd, id);
+- }
+- }
+-}
++ pr_debug("Releasing process (pasid %d) in workqueue\n",
++ p->pasid);
+
+-static void kfd_process_destroy_pdds(struct kfd_process *p)
+-{
+- struct kfd_process_device *pdd, *temp;
++ mutex_lock(&p->mutex);
+
+ list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+- per_device_list) {
+- /* Destroy the GPUVM VM context */
+- if (pdd->vm)
+- pdd->dev->kfd2kgd->destroy_process_vm(
+- pdd->dev->kgd, pdd->vm);
+-
+- list_del(&pdd->per_device_list);
++ per_device_list) {
++ pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
++ pdd->dev->id, p->pasid);
+
+- if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base)
+- free_pages((unsigned long)pdd->qpd.cwsr_kaddr,
+- get_order(KFD_CWSR_TBA_TMA_SIZE));
++ if (pdd->reset_wavefronts)
++ dbgdev_wave_reset_wavefronts(pdd->dev, p);
+
+- kfree(pdd->qpd.doorbell_bitmap);
+- idr_destroy(&pdd->alloc_idr);
++ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
++ list_del(&pdd->per_device_list);
+
+ kfree(pdd);
+ }
+-}
+-
+-/* No process locking is needed in this function, because the process
+- * is not findable any more. We must assume that no other thread is
+- * using it any more, otherwise we couldn't safely free the process
+- * structure in the end.
+- */
+-static void kfd_process_wq_release(struct work_struct *work)
+-{
+- struct kfd_process *p = container_of(work, struct kfd_process,
+- release_work);
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- struct kfd_process_device *pdd;
+-
+- pr_debug("Releasing process (pasid %d)\n",
+- p->pasid);
+-
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
+- pdd->dev->id, p->pasid);
+-
+- if (pdd->dev->device_info->is_need_iommu_device) {
+- if (pdd->bound == PDD_BOUND) {
+- amd_iommu_unbind_pasid(pdd->dev->pdev,
+- p->pasid);
+- pdd->bound = PDD_UNBOUND;
+- }
+- }
+- }
+-#endif
+-
+- kfd_process_free_outstanding_kfd_bos(p);
+-
+- kfd_process_destroy_pdds(p);
+- dma_fence_put(p->ef);
+
+ kfd_event_free_process(p);
+
+ kfd_pasid_free(p->pasid);
+- kfd_free_process_doorbells(p);
++
++ mutex_unlock(&p->mutex);
+
+ mutex_destroy(&p->mutex);
+
+- put_task_struct(p->lead_thread);
++ kfree(p->queues);
+
+ kfree(p);
++
++ kfree(work);
+ }
+
+-static void kfd_process_ref_release(struct kref *ref)
++static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+ {
+- struct kfd_process *p = container_of(ref, struct kfd_process, ref);
++ struct kfd_process_release_work *work;
++ struct kfd_process *p;
+
+- if (WARN_ON(!kfd_process_wq))
+- return;
++ p = container_of(rcu, struct kfd_process, rcu);
++ WARN_ON(atomic_read(&p->mm->mm_count) <= 0);
+
+- INIT_WORK(&p->release_work, kfd_process_wq_release);
+- queue_work(kfd_process_wq, &p->release_work);
+-}
++ mmdrop(p->mm);
+
+-static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+-{
+- struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
++ work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC);
+
+- kfd_unref_process(p);
++ if (work) {
++ INIT_WORK((struct work_struct *) work, kfd_process_wq_release);
++ work->p = p;
++ queue_work(kfd_process_wq, (struct work_struct *) work);
++ }
+ }
+
+ static void kfd_process_notifier_release(struct mmu_notifier *mn,
+@@ -432,37 +233,35 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
+ mutex_unlock(&kfd_processes_mutex);
+ synchronize_srcu(&kfd_processes_srcu);
+
+- cancel_delayed_work_sync(&p->eviction_work);
+- cancel_delayed_work_sync(&p->restore_work);
+-
+ mutex_lock(&p->mutex);
+
+- /* Iterate over all process device data structures and if the
+- * pdd is in debug mode, we should first force unregistration,
+- * then we will be able to destroy the queues
++ /* In case our notifier is called before IOMMU notifier */
++ pqm_uninit(&p->pqm);
++
++ /* Iterate over all process device data structures and check
++ * if we should delete debug managers and reset all wavefronts
+ */
+ list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
+- struct kfd_dev *dev = pdd->dev;
+-
+- mutex_lock(kfd_get_dbgmgr_mutex());
+- if (dev && dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) {
+- if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) {
+- kfd_dbgmgr_destroy(dev->dbgmgr);
+- dev->dbgmgr = NULL;
+- }
++ if ((pdd->dev->dbgmgr) &&
++ (pdd->dev->dbgmgr->pasid == p->pasid))
++ kfd_dbgmgr_destroy(pdd->dev->dbgmgr);
++
++ if (pdd->reset_wavefronts) {
++ pr_warn("Resetting all wave fronts\n");
++ dbgdev_wave_reset_wavefronts(pdd->dev, p);
++ pdd->reset_wavefronts = false;
+ }
+- mutex_unlock(kfd_get_dbgmgr_mutex());
+ }
+
+- kfd_process_dequeue_from_all_devices(p);
+- pqm_uninit(&p->pqm);
+-
+- /* Indicate to other users that MM is no longer valid */
+- p->mm = NULL;
+-
+ mutex_unlock(&p->mutex);
+
+- mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
++ /*
++ * Because we drop mm_count inside kfd_process_destroy_delayed
++ * and because the mmu_notifier_unregister function also drop
++ * mm_count we need to take an extra count here.
++ */
++ mmgrab(p->mm);
++ mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm);
+ mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
+ }
+
+@@ -470,67 +269,7 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
+ .release = kfd_process_notifier_release,
+ };
+
+-static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep)
+-{
+- int ret;
+- unsigned long offset;
+- struct kfd_process_device *temp, *pdd = NULL;
+- struct kfd_dev *dev = NULL;
+- struct qcm_process_device *qpd = NULL;
+- void *kaddr;
+- uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
+- ALLOC_MEM_FLAGS_NO_SUBSTITUTE |
+- ALLOC_MEM_FLAGS_READONLY |
+- ALLOC_MEM_FLAGS_EXECUTE_ACCESS;
+-
+- list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+- per_device_list) {
+- dev = pdd->dev;
+- qpd = &pdd->qpd;
+- if (!dev->cwsr_enabled || qpd->cwsr_kaddr)
+- continue;
+- if (qpd->cwsr_base) {
+- /* cwsr_base is only set for DGPU */
+- ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base,
+- KFD_CWSR_TBA_TMA_SIZE, &kaddr, pdd, flags);
+- if (!ret) {
+- qpd->cwsr_kaddr = kaddr;
+- qpd->tba_addr = qpd->cwsr_base;
+- } else
+- /* In case of error, the kfd_bos for some pdds
+- * which are already allocated successfully
+- * will be freed in upper level function
+- * i.e. create_process().
+- */
+- return ret;
+- } else {
+- offset = (dev->id |
+- KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT;
+- qpd->tba_addr = (uint64_t)vm_mmap(filep, 0,
+- KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC,
+- MAP_SHARED, offset);
+-
+- if (IS_ERR_VALUE(qpd->tba_addr)) {
+- pr_err("Failure to set tba address. error -%d.\n",
+- (int)qpd->tba_addr);
+- qpd->tba_addr = 0;
+- qpd->cwsr_kaddr = NULL;
+- return -ENOMEM;
+- }
+- }
+-
+- memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size);
+-
+- qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET;
+- pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n",
+- qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr);
+- }
+-
+- return 0;
+-}
+-
+-static struct kfd_process *create_process(const struct task_struct *thread,
+- struct file *filep)
++static struct kfd_process *create_process(const struct task_struct *thread)
+ {
+ struct kfd_process *process;
+ int err = -ENOMEM;
+@@ -540,24 +279,22 @@ static struct kfd_process *create_process(const struct task_struct *thread,
+ if (!process)
+ goto err_alloc_process;
+
+- process->bo_interval_tree = RB_ROOT_CACHED;
++ process->queues = kmalloc_array(INITIAL_QUEUE_ARRAY_SIZE,
++ sizeof(process->queues[0]), GFP_KERNEL);
++ if (!process->queues)
++ goto err_alloc_queues;
+
+ process->pasid = kfd_pasid_alloc();
+ if (process->pasid == 0)
+ goto err_alloc_pasid;
+
+- if (kfd_alloc_process_doorbells(process) < 0)
+- goto err_alloc_doorbells;
+-
+- kref_init(&process->ref);
+-
+ mutex_init(&process->mutex);
+
+ process->mm = thread->mm;
+
+ /* register notifier */
+ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
+- err = mmu_notifier_register(&process->mmu_notifier, process->mm);
++ err = __mmu_notifier_register(&process->mmu_notifier, process->mm);
+ if (err)
+ goto err_mmu_notifier;
+
+@@ -565,7 +302,8 @@ static struct kfd_process *create_process(const struct task_struct *thread,
+ (uintptr_t)process->mm);
+
+ process->lead_thread = thread->group_leader;
+- get_task_struct(process->lead_thread);
++
++ process->queue_array_size = INITIAL_QUEUE_ARRAY_SIZE;
+
+ INIT_LIST_HEAD(&process->per_device_data);
+
+@@ -581,28 +319,8 @@ static struct kfd_process *create_process(const struct task_struct *thread,
+ if (err != 0)
+ goto err_init_apertures;
+
+- INIT_DELAYED_WORK(&process->eviction_work, kfd_evict_bo_worker);
+- INIT_DELAYED_WORK(&process->restore_work, kfd_restore_bo_worker);
+- process->last_restore_timestamp = get_jiffies_64();
+-
+- err = kfd_process_reserve_ib_mem(process);
+- if (err)
+- goto err_reserve_ib_mem;
+- err = kfd_process_init_cwsr(process, filep);
+- if (err)
+- goto err_init_cwsr;
+-
+- /* If PeerDirect interface was not detected try to detect it again
+- * in case if network driver was loaded later.
+- */
+- kfd_init_peer_direct();
+-
+ return process;
+
+-err_init_cwsr:
+-err_reserve_ib_mem:
+- kfd_process_free_outstanding_kfd_bos(process);
+- kfd_process_destroy_pdds(process);
+ err_init_apertures:
+ pqm_uninit(&process->pqm);
+ err_process_pqm_init:
+@@ -611,40 +329,15 @@ static struct kfd_process *create_process(const struct task_struct *thread,
+ mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
+ err_mmu_notifier:
+ mutex_destroy(&process->mutex);
+- kfd_free_process_doorbells(process);
+-err_alloc_doorbells:
+ kfd_pasid_free(process->pasid);
+ err_alloc_pasid:
++ kfree(process->queues);
++err_alloc_queues:
+ kfree(process);
+ err_alloc_process:
+ return ERR_PTR(err);
+ }
+
+-static int init_doorbell_bitmap(struct qcm_process_device *qpd,
+- struct kfd_dev *dev)
+-{
+- unsigned int i;
+-
+- if (!KFD_IS_SOC15(dev->device_info->asic_family))
+- return 0;
+-
+- qpd->doorbell_bitmap =
+- kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
+- BITS_PER_BYTE), GFP_KERNEL);
+- if (!qpd->doorbell_bitmap)
+- return -ENOMEM;
+-
+- /* Mask out any reserved doorbells */
+- for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++)
+- if ((dev->shared_resources.reserved_doorbell_mask & i) ==
+- dev->shared_resources.reserved_doorbell_val) {
+- set_bit(i, qpd->doorbell_bitmap);
+- pr_debug("reserved doorbell 0x%03x\n", i);
+- }
+-
+- return 0;
+-}
+-
+ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
+ struct kfd_process *p)
+ {
+@@ -652,9 +345,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
+
+ list_for_each_entry(pdd, &p->per_device_data, per_device_list)
+ if (pdd->dev == dev)
+- return pdd;
++ break;
+
+- return NULL;
++ return pdd;
+ }
+
+ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+@@ -663,41 +356,16 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
+ struct kfd_process_device *pdd = NULL;
+
+ pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
+- if (!pdd)
+- return NULL;
+-
+- pdd->dev = dev;
+- INIT_LIST_HEAD(&pdd->qpd.queues_list);
+- INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
+- pdd->qpd.dqm = dev->dqm;
+- pdd->qpd.pqm = &p->pqm;
+- pdd->qpd.evicted = 0;
+- pdd->process = p;
+- pdd->bound = PDD_UNBOUND;
+- pdd->already_dequeued = false;
+- list_add(&pdd->per_device_list, &p->per_device_data);
+-
+- /* Init idr used for memory handle translation */
+- idr_init(&pdd->alloc_idr);
+- if (init_doorbell_bitmap(&pdd->qpd, dev)) {
+- pr_err("Failed to init doorbell for process\n");
+- goto err_create_pdd;
++ if (pdd != NULL) {
++ pdd->dev = dev;
++ INIT_LIST_HEAD(&pdd->qpd.queues_list);
++ INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
++ pdd->qpd.dqm = dev->dqm;
++ pdd->reset_wavefronts = false;
++ list_add(&pdd->per_device_list, &p->per_device_data);
+ }
+
+- /* Create the GPUVM context for this specific device */
+- if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm,
+- &p->process_info, &p->ef)) {
+- pr_err("Failed to create process VM object\n");
+- goto err_create_pdd;
+- }
+ return pdd;
+-
+-err_create_pdd:
+- kfree(pdd->qpd.doorbell_bitmap);
+- idr_destroy(&pdd->alloc_idr);
+- list_del(&pdd->per_device_list);
+- kfree(pdd);
+- return NULL;
+ }
+
+ /*
+@@ -711,6 +379,7 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
+ struct kfd_process *p)
+ {
+ struct kfd_process_device *pdd;
++ int err;
+
+ pdd = kfd_get_process_device_data(dev, p);
+ if (!pdd) {
+@@ -718,94 +387,19 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
+ return ERR_PTR(-ENOMEM);
+ }
+
+- if (pdd->bound == PDD_BOUND) {
++ if (pdd->bound)
+ return pdd;
+- } else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) {
+- pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n");
+- return ERR_PTR(-EINVAL);
+- }
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- if (dev->device_info->is_need_iommu_device) {
+- int err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+- p->lead_thread);
+- if (err < 0)
+- return ERR_PTR(err);
+- }
+-#endif
++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread);
++ if (err < 0)
++ return ERR_PTR(err);
+
+- pdd->bound = PDD_BOUND;
++ pdd->bound = true;
+
+ return pdd;
+ }
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+-/*
+- * Bind processes to the device that have been temporarily unbound
+- * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device.
+- */
+-int kfd_bind_processes_to_device(struct kfd_dev *dev)
+-{
+- struct kfd_process_device *pdd;
+- struct kfd_process *p;
+- unsigned int temp;
+- int err = 0;
+-
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- mutex_lock(&p->mutex);
+- pdd = kfd_get_process_device_data(dev, p);
+- if (pdd->bound != PDD_BOUND_SUSPENDED) {
+- mutex_unlock(&p->mutex);
+- continue;
+- }
+-
+- err = amd_iommu_bind_pasid(dev->pdev, p->pasid,
+- p->lead_thread);
+- if (err < 0) {
+- pr_err("Unexpected pasid %d binding failure\n",
+- p->pasid);
+- mutex_unlock(&p->mutex);
+- break;
+- }
+-
+- pdd->bound = PDD_BOUND;
+- mutex_unlock(&p->mutex);
+- }
+-
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-
+- return err;
+-}
+-
+-/*
+- * Mark currently bound processes as PDD_BOUND_SUSPENDED. These
+- * processes will be restored to PDD_BOUND state in
+- * kfd_bind_processes_to_device.
+- */
+-void kfd_unbind_processes_from_device(struct kfd_dev *dev)
+-{
+- struct kfd_process_device *pdd;
+- struct kfd_process *p;
+- unsigned int temp;
+-
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- mutex_lock(&p->mutex);
+- pdd = kfd_get_process_device_data(dev, p);
+-
+- if (pdd->bound == PDD_BOUND)
+- pdd->bound = PDD_BOUND_SUSPENDED;
+- mutex_unlock(&p->mutex);
+- }
+-
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-}
+-
+-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
++void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
+ {
+ struct kfd_process *p;
+ struct kfd_process_device *pdd;
+@@ -821,31 +415,34 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid)
+
+ pr_debug("Unbinding process %d from IOMMU\n", pasid);
+
+- mutex_lock(kfd_get_dbgmgr_mutex());
++ if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid))
++ kfd_dbgmgr_destroy(dev->dbgmgr);
+
+- if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) {
+- if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) {
+- kfd_dbgmgr_destroy(dev->dbgmgr);
+- dev->dbgmgr = NULL;
+- }
+- }
++ pqm_uninit(&p->pqm);
+
+- mutex_unlock(kfd_get_dbgmgr_mutex());
++ pdd = kfd_get_process_device_data(dev, p);
+
+- mutex_lock(&p->mutex);
++ if (!pdd) {
++ mutex_unlock(&p->mutex);
++ return;
++ }
+
+- pdd = kfd_get_process_device_data(dev, p);
+- if (pdd)
+- /* For GPU relying on IOMMU, we need to dequeue here
+- * when PASID is still bound.
+- */
+- kfd_process_dequeue_from_device(pdd);
++ if (pdd->reset_wavefronts) {
++ dbgdev_wave_reset_wavefronts(pdd->dev, p);
++ pdd->reset_wavefronts = false;
++ }
+
+- mutex_unlock(&p->mutex);
++ /*
++ * Just mark pdd as unbound, because we still need it
++ * to call amd_iommu_unbind_pasid() when the
++ * process exits.
++ * We don't call amd_iommu_unbind_pasid() here
++ * because the IOMMU called us.
++ */
++ pdd->bound = false;
+
+- kfd_unref_process(p);
++ mutex_unlock(&p->mutex);
+ }
+-#endif /* CONFIG_AMD_IOMMU_V2 */
+
+ struct kfd_process_device *kfd_get_first_process_device_data(
+ struct kfd_process *p)
+@@ -869,277 +466,22 @@ bool kfd_has_process_device_data(struct kfd_process *p)
+ return !(list_empty(&p->per_device_data));
+ }
+
+-/* Create specific handle mapped to mem from process local memory idr
+- * Assumes that the process lock is held.
+- */
+-int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+- void *mem, uint64_t start,
+- uint64_t length,
+- struct kfd_ipc_obj *ipc_obj)
+-{
+- int handle;
+- struct kfd_bo *buf_obj;
+- struct kfd_process *p;
+-
+- p = pdd->process;
+-
+- buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL);
+-
+- if (!buf_obj)
+- return -ENOMEM;
+-
+- buf_obj->it.start = start;
+- buf_obj->it.last = start + length - 1;
+- interval_tree_insert(&buf_obj->it, &p->bo_interval_tree);
+-
+- buf_obj->mem = mem;
+- buf_obj->dev = pdd->dev;
+- buf_obj->kfd_ipc_obj = ipc_obj;
+-
+- INIT_LIST_HEAD(&buf_obj->cb_data_head);
+-
+- idr_preload(GFP_KERNEL);
+-
+- handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID,
+- GFP_NOWAIT);
+-
+- idr_preload_end();
+-
+- if (handle < 0)
+- kfree(buf_obj);
+-
+- return handle;
+-}
+-
+-struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd,
+- int handle)
+-{
+- if (handle < 0)
+- return NULL;
+-
+- return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle);
+-}
+-
+-/* Translate specific handle from process local memory idr
+- * Assumes that the process lock is held.
+- */
+-void *kfd_process_device_translate_handle(struct kfd_process_device *pdd,
+- int handle)
+-{
+- struct kfd_bo *buf_obj;
+-
+- buf_obj = kfd_process_device_find_bo(pdd, handle);
+-
+- return buf_obj->mem;
+-}
+-
+-void *kfd_process_find_bo_from_interval(struct kfd_process *p,
+- uint64_t start_addr,
+- uint64_t last_addr)
+-{
+- struct interval_tree_node *it_node;
+- struct kfd_bo *buf_obj;
+-
+- it_node = interval_tree_iter_first(&p->bo_interval_tree,
+- start_addr, last_addr);
+- if (!it_node) {
+- pr_err("0x%llx-0x%llx does not relate to an existing buffer\n",
+- start_addr, last_addr);
+- return NULL;
+- }
+-
+- if (interval_tree_iter_next(it_node, start_addr, last_addr)) {
+- pr_err("0x%llx-0x%llx spans more than a single BO\n",
+- start_addr, last_addr);
+- return NULL;
+- }
+-
+- buf_obj = container_of(it_node, struct kfd_bo, it);
+-
+- return buf_obj;
+-}
+-
+-/* Remove specific handle from process local memory idr
+- * Assumes that the process lock is held.
+- */
+-void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
+- int handle)
+-{
+- struct kfd_bo *buf_obj;
+- struct kfd_process *p;
+-
+- p = pdd->process;
+-
+- if (handle < 0)
+- return;
+-
+- buf_obj = kfd_process_device_find_bo(pdd, handle);
+-
+- if (buf_obj->kfd_ipc_obj)
+- ipc_obj_put(&buf_obj->kfd_ipc_obj);
+-
+- idr_remove(&pdd->alloc_idr, handle);
+-
+- interval_tree_remove(&buf_obj->it, &p->bo_interval_tree);
+-
+- kfree(buf_obj);
+-}
+-
+-/* This increments the process->ref counter. */
++/* This returns with process->mutex locked. */
+ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid)
+ {
+- struct kfd_process *p, *ret_p = NULL;
++ struct kfd_process *p;
+ unsigned int temp;
+
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ if (p->pasid == pasid) {
+- kref_get(&p->ref);
+- ret_p = p;
++ mutex_lock(&p->mutex);
+ break;
+ }
+ }
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+- return ret_p;
+-}
+-
+-void kfd_suspend_all_processes(void)
+-{
+- struct kfd_process *p;
+- unsigned int temp;
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- cancel_delayed_work_sync(&p->eviction_work);
+- cancel_delayed_work_sync(&p->restore_work);
+-
+- if (quiesce_process_mm(p))
+- pr_err("Failed to suspend process %d\n", p->pasid);
+- dma_fence_signal(p->ef);
+- dma_fence_put(p->ef);
+- p->ef = NULL;
+- }
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-}
+-
+-int kfd_resume_all_processes(void)
+-{
+- struct kfd_process *p;
+- unsigned int temp;
+- int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- if (!schedule_delayed_work(&p->restore_work, 0)) {
+- pr_err("Restore process %d failed during resume\n",
+- p->pasid);
+- ret = -EFAULT;
+- }
+- }
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+- return ret;
+-}
+-
+-/* This increments the process->ref counter. */
+-struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
+-{
+- struct kfd_process *p;
+-
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- p = find_process_by_mm(mm);
+- if (p)
+- kref_get(&p->ref);
+-
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-
+ return p;
+ }
+-
+-int kfd_reserved_mem_mmap(struct kfd_process *process,
+- struct vm_area_struct *vma)
+-{
+- struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff);
+- struct kfd_process_device *pdd;
+- struct qcm_process_device *qpd;
+-
+- if (!dev)
+- return -EINVAL;
+- if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) {
+- pr_err("Incorrect CWSR mapping size.\n");
+- return -EINVAL;
+- }
+-
+- pdd = kfd_get_process_device_data(dev, process);
+- if (!pdd)
+- return -EINVAL;
+- qpd = &pdd->qpd;
+-
+- qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+- get_order(KFD_CWSR_TBA_TMA_SIZE));
+- if (!qpd->cwsr_kaddr) {
+- pr_err("Error allocating per process CWSR buffer.\n");
+- return -ENOMEM;
+- }
+-
+- vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND
+- | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP;
+- /* Mapping pages to user process */
+- return remap_pfn_range(vma, vma->vm_start,
+- PFN_DOWN(__pa(qpd->cwsr_kaddr)),
+- KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
+-{
+- struct kfd_process *p;
+- unsigned int temp;
+- int r = 0;
+-
+- int idx = srcu_read_lock(&kfd_processes_srcu);
+-
+- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+- seq_printf(m, "Process %d PASID %d:\n",
+- p->lead_thread->tgid, p->pasid);
+-
+- mutex_lock(&p->mutex);
+- r = pqm_debugfs_mqds(m, &p->pqm);
+- mutex_unlock(&p->mutex);
+-
+- if (r != 0)
+- break;
+- }
+-
+- srcu_read_unlock(&kfd_processes_srcu, idx);
+-
+- return r;
+-}
+-
+-#endif
+-
+-void kfd_flush_tlb(struct kfd_dev *dev, struct kfd_process *p)
+-{
+- const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+-
+- if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
+- struct kfd_process_device *pdd =
+- kfd_get_process_device_data(dev, p);
+- if (!pdd) {
+- pr_err("could not find pdd for pasid %d\n", p->pasid);
+- return;
+- }
+-
+- /* vmid allocation is delayed to the creation of the first
+- * queue of the process. For buffers allocated and mapped
+- * before queue creation, vmid is still not allocated (valued 0).
+- * Ignore tlb invalidation request for this case.
+- */
+- if (pdd->qpd.vmid)
+- f2g->invalidate_tlbs_vmid(dev->kgd, pdd->qpd.vmid);
+- } else
+- f2g->invalidate_tlbs(dev->kgd, p->pasid);
+-}
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+index a7ec177..5f82905 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+@@ -63,25 +63,6 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
+ return 0;
+ }
+
+-void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
+-{
+- struct kfd_dev *dev = pdd->dev;
+-
+- if (pdd->already_dequeued)
+- return;
+-
+- dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
+- pdd->already_dequeued = true;
+-}
+-
+-void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
+-{
+- struct kfd_process_device *pdd;
+-
+- list_for_each_entry(pdd, &p->per_device_data, per_device_list)
+- kfd_process_dequeue_from_device(pdd);
+-}
+-
+ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
+ {
+ INIT_LIST_HEAD(&pqm->queues);
+@@ -97,14 +78,21 @@ int pqm_init(struct process_queue_manager *pqm, struct kfd_process *p)
+
+ void pqm_uninit(struct process_queue_manager *pqm)
+ {
++ int retval;
+ struct process_queue_node *pqn, *next;
+
+ list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
+- uninit_queue(pqn->q);
+- list_del(&pqn->process_queue_list);
+- kfree(pqn);
++ retval = pqm_destroy_queue(
++ pqm,
++ (pqn->q != NULL) ?
++ pqn->q->properties.queue_id :
++ pqn->kq->queue->properties.queue_id);
++
++ if (retval != 0) {
++ pr_err("failed to destroy queue\n");
++ return;
++ }
+ }
+-
+ kfree(pqm->queue_slot_bitmap);
+ pqm->queue_slot_bitmap = NULL;
+ }
+@@ -119,6 +107,9 @@ static int create_cp_queue(struct process_queue_manager *pqm,
+ /* Doorbell initialized in user space*/
+ q_properties->doorbell_ptr = NULL;
+
++ q_properties->doorbell_off =
++ kfd_queue_id_to_doorbell(dev, pqm->process, qid);
++
+ /* let DQM handle it*/
+ q_properties->vmid = 0;
+ q_properties->queue_id = qid;
+@@ -139,16 +130,20 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ struct kfd_dev *dev,
+ struct file *f,
+ struct queue_properties *properties,
++ unsigned int flags,
++ enum kfd_queue_type type,
+ unsigned int *qid)
+ {
+ int retval;
+ struct kfd_process_device *pdd;
++ struct queue_properties q_properties;
+ struct queue *q;
+ struct process_queue_node *pqn;
+ struct kernel_queue *kq;
+- enum kfd_queue_type type = properties->type;
+- unsigned int max_queues = 127; /* HWS limit */
++ int num_queues = 0;
++ struct queue *cur;
+
++ memcpy(&q_properties, properties, sizeof(struct queue_properties));
+ q = NULL;
+ kq = NULL;
+
+@@ -164,19 +159,22 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ * If we are just about to create DIQ, the is_debug flag is not set yet
+ * Hence we also check the type as well
+ */
+- if ((pdd->qpd.is_debug) || (type == KFD_QUEUE_TYPE_DIQ))
+- max_queues = dev->device_info->max_no_of_hqd/2;
+-
+- if (pdd->qpd.queue_count >= max_queues)
+- return -ENOSPC;
++ if ((pdd->qpd.is_debug) ||
++ (type == KFD_QUEUE_TYPE_DIQ)) {
++ list_for_each_entry(cur, &pdd->qpd.queues_list, list)
++ num_queues++;
++ if (num_queues >= dev->device_info->max_no_of_hqd/2)
++ return -ENOSPC;
++ }
+
+ retval = find_available_queue_slot(pqm, qid);
+ if (retval != 0)
+ return retval;
+
+- if (list_empty(&pdd->qpd.queues_list) &&
+- list_empty(&pdd->qpd.priv_queue_list))
++ if (list_empty(&pqm->queues)) {
++ pdd->qpd.pqm = pqm;
+ dev->dqm->ops.register_process(dev->dqm, &pdd->qpd);
++ }
+
+ pqn = kzalloc(sizeof(*pqn), GFP_KERNEL);
+ if (!pqn) {
+@@ -186,40 +184,23 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+
+ switch (type) {
+ case KFD_QUEUE_TYPE_SDMA:
+- if (dev->dqm->sdma_queue_count
+- >= get_num_sdma_queues(dev->dqm)) {
+- pr_debug("Over-subscription is not allowed for SDMA\n");
+- retval = -EPERM;
+- goto err_create_queue;
+- }
+-
+- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid);
+- if (retval != 0)
+- goto err_create_queue;
+- pqn->q = q;
+- pqn->kq = NULL;
+- retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd);
+- pr_debug("DQM returned %d for create_queue\n", retval);
+- print_queue(q);
+- break;
+-
+ case KFD_QUEUE_TYPE_COMPUTE:
+ /* check if there is over subscription */
+- if ((dev->dqm->sched_policy ==
+- KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
+- ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) ||
++ if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
++ ((dev->dqm->processes_count >= VMID_PER_DEVICE) ||
+ (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) {
+- pr_debug("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
++ pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n");
+ retval = -EPERM;
+ goto err_create_queue;
+ }
+
+- retval = create_cp_queue(pqm, dev, &q, properties, f, *qid);
++ retval = create_cp_queue(pqm, dev, &q, &q_properties, f, *qid);
+ if (retval != 0)
+ goto err_create_queue;
+ pqn->q = q;
+ pqn->kq = NULL;
+- retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd);
++ retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd,
++ &q->properties.vmid);
+ pr_debug("DQM returned %d for create_queue\n", retval);
+ print_queue(q);
+ break;
+@@ -245,22 +226,14 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ goto err_create_queue;
+ }
+
+- if (q)
+- /* Return the doorbell offset within the doorbell page
+- * to the caller so it can be passed up to user mode
+- * (in bytes).
+- */
+- properties->doorbell_off =
+- (q->properties.doorbell_off * sizeof(uint32_t)) &
+- (kfd_doorbell_process_slice(dev) - 1);
+-
+ pr_debug("PQM After DQM create queue\n");
+
+ list_add(&pqn->process_queue_list, &pqm->queues);
+
+ if (q) {
++ *properties = q->properties;
+ pr_debug("PQM done creating queue\n");
+- print_queue_properties(&q->properties);
++ print_queue_properties(properties);
+ }
+
+ return retval;
+@@ -270,8 +243,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
+ err_allocate_pqn:
+ /* check if queues list is empty unregister process from device */
+ clear_bit(*qid, pqm->queue_slot_bitmap);
+- if (list_empty(&pdd->qpd.queues_list) &&
+- list_empty(&pdd->qpd.priv_queue_list))
++ if (list_empty(&pqm->queues))
+ dev->dqm->ops.unregister_process(dev->dqm, &pdd->qpd);
+ return retval;
+ }
+@@ -317,13 +289,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
+
+ if (pqn->q) {
+ dqm = pqn->q->device->dqm;
+- kfree(pqn->q->properties.cu_mask);
+- pqn->q->properties.cu_mask = NULL;
+ retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
+- if (retval) {
+- pr_debug("Destroy queue failed, returned %d\n", retval);
+- goto err_destroy_queue;
+- }
++ if (retval != 0)
++ return retval;
++
+ uninit_queue(pqn->q);
+ }
+
+@@ -331,11 +300,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
+ kfree(pqn);
+ clear_bit(qid, pqm->queue_slot_bitmap);
+
+- if (list_empty(&pdd->qpd.queues_list) &&
+- list_empty(&pdd->qpd.priv_queue_list))
++ if (list_empty(&pqm->queues))
+ dqm->ops.unregister_process(dqm, &pdd->qpd);
+
+-err_destroy_queue:
+ return retval;
+ }
+
+@@ -364,34 +331,6 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
+ return 0;
+ }
+
+-int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
+- struct queue_properties *p)
+-{
+- int retval;
+- struct process_queue_node *pqn;
+-
+- pqn = get_queue_by_qid(pqm, qid);
+- if (!pqn) {
+- pr_debug("No queue %d exists for update operation\n", qid);
+- return -EFAULT;
+- }
+-
+- /* Free the old CU mask memory if it is already allocated, then
+- * allocate memory for the new CU mask.
+- */
+- kfree(pqn->q->properties.cu_mask);
+-
+- pqn->q->properties.cu_mask_count = p->cu_mask_count;
+- pqn->q->properties.cu_mask = p->cu_mask;
+-
+- retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+- pqn->q);
+- if (retval != 0)
+- return retval;
+-
+- return 0;
+-}
+-
+ struct kernel_queue *pqm_get_kernel_queue(
+ struct process_queue_manager *pqm,
+ unsigned int qid)
+@@ -405,89 +344,4 @@ struct kernel_queue *pqm_get_kernel_queue(
+ return NULL;
+ }
+
+-int pqm_get_wave_state(struct process_queue_manager *pqm,
+- unsigned int qid,
+- void __user *ctl_stack,
+- u32 *ctl_stack_used_size,
+- u32 *save_area_used_size)
+-{
+- struct process_queue_node *pqn;
+-
+- pqn = get_queue_by_qid(pqm, qid);
+- if (!pqn) {
+- pr_debug("amdkfd: No queue %d exists for operation\n",
+- qid);
+- return -EFAULT;
+- }
+-
+- return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
+- pqn->q,
+- ctl_stack,
+- ctl_stack_used_size,
+- save_area_used_size);
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-int pqm_debugfs_mqds(struct seq_file *m, void *data)
+-{
+- struct process_queue_manager *pqm = data;
+- struct process_queue_node *pqn;
+- struct queue *q;
+- enum KFD_MQD_TYPE mqd_type;
+- struct mqd_manager *mqd_manager;
+- int r = 0;
+-
+- list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
+- if (pqn->q) {
+- q = pqn->q;
+- switch (q->properties.type) {
+- case KFD_QUEUE_TYPE_SDMA:
+- seq_printf(m, " SDMA queue on device %x\n",
+- q->device->id);
+- mqd_type = KFD_MQD_TYPE_SDMA;
+- break;
+- case KFD_QUEUE_TYPE_COMPUTE:
+- seq_printf(m, " Compute queue on device %x\n",
+- q->device->id);
+- mqd_type = KFD_MQD_TYPE_CP;
+- break;
+- default:
+- seq_printf(m,
+- " Bad user queue type %d on device %x\n",
+- q->properties.type, q->device->id);
+- continue;
+- }
+- mqd_manager = q->device->dqm->ops.get_mqd_manager(
+- q->device->dqm, mqd_type);
+- } else if (pqn->kq) {
+- q = pqn->kq->queue;
+- mqd_manager = pqn->kq->mqd;
+- switch (q->properties.type) {
+- case KFD_QUEUE_TYPE_DIQ:
+- seq_printf(m, " DIQ on device %x\n",
+- pqn->kq->dev->id);
+- mqd_type = KFD_MQD_TYPE_HIQ;
+- break;
+- default:
+- seq_printf(m,
+- " Bad kernel queue type %d on device %x\n",
+- q->properties.type,
+- pqn->kq->dev->id);
+- continue;
+- }
+- } else {
+- seq_printf(m,
+- " Weird: Queue node with neither kernel nor user queue\n");
+- continue;
+- }
+-
+- r = mqd_manager->debugfs_show_mqd(m, q->mqd);
+- if (r != 0)
+- break;
+- }
+-
+- return r;
+-}
+
+-#endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+deleted file mode 100644
+index 2f5cdb9..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
++++ /dev/null
+@@ -1,294 +0,0 @@
+-/*
+- * Copyright 2015 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#include <linux/device.h>
+-#include <linux/export.h>
+-#include <linux/pid.h>
+-#include <linux/err.h>
+-#include <linux/slab.h>
+-#include "kfd_priv.h"
+-
+-
+-struct rdma_cb {
+- struct list_head node;
+- struct amd_p2p_info amd_p2p_data;
+- void (*free_callback)(void *client_priv);
+- void *client_priv;
+-};
+-
+-/**
+- * This function makes the pages underlying a range of GPU virtual memory
+- * accessible for DMA operations from another PCIe device
+- *
+- * \param address - The start address in the Unified Virtual Address
+- * space in the specified process
+- * \param length - The length of requested mapping
+- * \param pid - Pointer to structure pid to which address belongs.
+- * Could be NULL for current process address space.
+- * \param p2p_data - On return: Pointer to structure describing
+- * underlying pages/locations
+- * \param free_callback - Pointer to callback which will be called when access
+- * to such memory must be stopped immediately: Memory
+- * was freed, GECC events, etc.
+- * Client should immediately stop any transfer
+- * operations and returned as soon as possible.
+- * After return all resources associated with address
+- * will be release and no access will be allowed.
+- * \param client_priv - Pointer to be passed as parameter on
+- * 'free_callback;
+- *
+- * \return 0 if operation was successful
+- */
+-static int get_pages(uint64_t address, uint64_t length, struct pid *pid,
+- struct amd_p2p_info **amd_p2p_data,
+- void (*free_callback)(void *client_priv),
+- void *client_priv)
+-{
+- struct kfd_bo *buf_obj;
+- struct kgd_mem *mem;
+- struct sg_table *sg_table_tmp;
+- struct kfd_dev *dev;
+- uint64_t last = address + length - 1;
+- uint64_t offset;
+- struct kfd_process *p;
+- struct rdma_cb *rdma_cb_data;
+- int ret = 0;
+-
+- p = kfd_lookup_process_by_pid(pid);
+- if (!p) {
+- pr_err("Could not find the process\n");
+- return -EINVAL;
+- }
+- mutex_lock(&p->mutex);
+-
+- buf_obj = kfd_process_find_bo_from_interval(p, address, last);
+- if (!buf_obj) {
+- pr_err("Cannot find a kfd_bo for the range\n");
+- ret = -EINVAL;
+- goto out;
+- }
+-
+- rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL);
+- if (!rdma_cb_data) {
+- *amd_p2p_data = NULL;
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+- mem = buf_obj->mem;
+- dev = buf_obj->dev;
+- offset = address - buf_obj->it.start;
+-
+- ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem,
+- offset, length, &sg_table_tmp);
+-
+- if (ret) {
+- pr_err("pin_get_sg_table_bo failed.\n");
+- *amd_p2p_data = NULL;
+- goto free_mem;
+- }
+-
+- rdma_cb_data->amd_p2p_data.va = address;
+- rdma_cb_data->amd_p2p_data.size = length;
+- rdma_cb_data->amd_p2p_data.pid = pid;
+- rdma_cb_data->amd_p2p_data.priv = buf_obj;
+- rdma_cb_data->amd_p2p_data.pages = sg_table_tmp;
+-
+- rdma_cb_data->free_callback = free_callback;
+- rdma_cb_data->client_priv = client_priv;
+-
+- list_add(&rdma_cb_data->node, &buf_obj->cb_data_head);
+-
+- *amd_p2p_data = &rdma_cb_data->amd_p2p_data;
+-
+- goto out;
+-
+-free_mem:
+- kfree(rdma_cb_data);
+-out:
+- mutex_unlock(&p->mutex);
+- kfd_unref_process(p);
+-
+- return ret;
+-}
+-
+-static int put_pages_helper(struct amd_p2p_info *p2p_data)
+-{
+- struct kfd_bo *buf_obj;
+- struct kfd_dev *dev;
+- struct sg_table *sg_table_tmp;
+- struct rdma_cb *rdma_cb_data;
+-
+- if (!p2p_data) {
+- pr_err("amd_p2p_info pointer is invalid.\n");
+- return -EINVAL;
+- }
+-
+- rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data);
+-
+- buf_obj = p2p_data->priv;
+- dev = buf_obj->dev;
+- sg_table_tmp = p2p_data->pages;
+-
+- list_del(&rdma_cb_data->node);
+- kfree(rdma_cb_data);
+-
+- dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp);
+-
+-
+- return 0;
+-}
+-
+-void run_rdma_free_callback(struct kfd_bo *buf_obj)
+-{
+- struct rdma_cb *tmp, *rdma_cb_data;
+-
+- list_for_each_entry_safe(rdma_cb_data, tmp,
+- &buf_obj->cb_data_head, node) {
+- if (rdma_cb_data->free_callback)
+- rdma_cb_data->free_callback(
+- rdma_cb_data->client_priv);
+-
+- put_pages_helper(&rdma_cb_data->amd_p2p_data);
+- }
+-}
+-
+-/**
+- *
+- * This function release resources previously allocated by get_pages() call.
+- *
+- * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries
+- * allocated by get_pages() call.
+- *
+- * \return 0 if operation was successful
+- */
+-static int put_pages(struct amd_p2p_info **p_p2p_data)
+-{
+- struct kfd_process *p = NULL;
+- int ret = 0;
+-
+- if (!(*p_p2p_data)) {
+- pr_err("amd_p2p_info pointer is invalid.\n");
+- return -EINVAL;
+- }
+-
+- p = kfd_lookup_process_by_pid((*p_p2p_data)->pid);
+- if (!p) {
+- pr_err("Could not find the process\n");
+- return -EINVAL;
+- }
+-
+- ret = put_pages_helper(*p_p2p_data);
+-
+- if (!ret)
+- *p_p2p_data = NULL;
+-
+- kfd_unref_process(p);
+-
+- return ret;
+-}
+-
+-/**
+- * Check if given address belongs to GPU address space.
+- *
+- * \param address - Address to check
+- * \param pid - Process to which given address belongs.
+- * Could be NULL if current one.
+- *
+- * \return 0 - This is not GPU address managed by AMD driver
+- * 1 - This is GPU address managed by AMD driver
+- */
+-static int is_gpu_address(uint64_t address, struct pid *pid)
+-{
+- struct kfd_bo *buf_obj;
+- struct kfd_process *p;
+-
+- p = kfd_lookup_process_by_pid(pid);
+- if (!p) {
+- pr_debug("Could not find the process\n");
+- return 0;
+- }
+-
+- buf_obj = kfd_process_find_bo_from_interval(p, address, address);
+-
+- kfd_unref_process(p);
+- if (!buf_obj)
+- return 0;
+-
+- return 1;
+-}
+-
+-/**
+- * Return the single page size to be used when building scatter/gather table
+- * for given range.
+- *
+- * \param address - Address
+- * \param length - Range length
+- * \param pid - Process id structure. Could be NULL if current one.
+- * \param page_size - On return: Page size
+- *
+- * \return 0 if operation was successful
+- */
+-static int get_page_size(uint64_t address, uint64_t length, struct pid *pid,
+- unsigned long *page_size)
+-{
+- /*
+- * As local memory is always consecutive, we can assume the local
+- * memory page size to be arbitrary.
+- * Currently we assume the local memory page size to be the same
+- * as system memory, which is 4KB.
+- */
+- *page_size = PAGE_SIZE;
+-
+- return 0;
+-}
+-
+-
+-/**
+- * Singleton object: rdma interface function pointers
+- */
+-static const struct amd_rdma_interface rdma_ops = {
+- .get_pages = get_pages,
+- .put_pages = put_pages,
+- .is_gpu_address = is_gpu_address,
+- .get_page_size = get_page_size,
+-};
+-
+-/**
+- * amdkfd_query_rdma_interface - Return interface (function pointers table) for
+- * rdma interface
+- *
+- *
+- * \param interace - OUT: Pointer to interface
+- *
+- * \return 0 if operation was successful.
+- */
+-int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops)
+-{
+- *ops = &rdma_ops;
+-
+- return 0;
+-}
+-EXPORT_SYMBOL(amdkfd_query_rdma_interface);
+-
+-
+-
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+index 58a5bef..19ce590 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+@@ -28,32 +28,27 @@
+ #include <linux/hash.h>
+ #include <linux/cpufreq.h>
+ #include <linux/log2.h>
+-#include <linux/dmi.h>
+-#include <linux/atomic.h>
+
+ #include "kfd_priv.h"
+ #include "kfd_crat.h"
+ #include "kfd_topology.h"
+-#include "kfd_device_queue_manager.h"
+
+-/* topology_device_list - Master list of all topology devices */
+ static struct list_head topology_device_list;
++static int topology_crat_parsed;
+ static struct kfd_system_properties sys_props;
+
+ static DECLARE_RWSEM(topology_lock);
+-static atomic_t topology_crat_proximity_domain;
+
+-struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+- uint32_t proximity_domain)
++struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+ {
+ struct kfd_topology_device *top_dev;
+- struct kfd_topology_device *device = NULL;
++ struct kfd_dev *device = NULL;
+
+ down_read(&topology_lock);
+
+ list_for_each_entry(top_dev, &topology_device_list, list)
+- if (top_dev->proximity_domain == proximity_domain) {
+- device = top_dev;
++ if (top_dev->gpu_id == gpu_id) {
++ device = top_dev->gpu;
+ break;
+ }
+
+@@ -62,7 +57,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
+ return device;
+ }
+
+-struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
++struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
+ {
+ struct kfd_topology_device *top_dev;
+ struct kfd_dev *device = NULL;
+@@ -70,7 +65,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+ down_read(&topology_lock);
+
+ list_for_each_entry(top_dev, &topology_device_list, list)
+- if (top_dev->gpu_id == gpu_id) {
++ if (top_dev->gpu->pdev == pdev) {
+ device = top_dev->gpu;
+ break;
+ }
+@@ -80,49 +75,282 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
+ return device;
+ }
+
+-struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
++static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size)
+ {
+- struct kfd_topology_device *top_dev;
+- struct kfd_dev *device = NULL;
++ struct acpi_table_header *crat_table;
++ acpi_status status;
+
+- down_read(&topology_lock);
++ if (!size)
++ return -EINVAL;
+
+- list_for_each_entry(top_dev, &topology_device_list, list)
+- if (top_dev->gpu && top_dev->gpu->pdev == pdev) {
+- device = top_dev->gpu;
++ /*
++ * Fetch the CRAT table from ACPI
++ */
++ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
++ if (status == AE_NOT_FOUND) {
++ pr_warn("CRAT table not found\n");
++ return -ENODATA;
++ } else if (ACPI_FAILURE(status)) {
++ const char *err = acpi_format_exception(status);
++
++ pr_err("CRAT table error: %s\n", err);
++ return -EINVAL;
++ }
++
++ if (*size >= crat_table->length && crat_image != NULL)
++ memcpy(crat_image, crat_table, crat_table->length);
++
++ *size = crat_table->length;
++
++ return 0;
++}
++
++static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
++ struct crat_subtype_computeunit *cu)
++{
++ dev->node_props.cpu_cores_count = cu->num_cpu_cores;
++ dev->node_props.cpu_core_id_base = cu->processor_id_low;
++ if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT)
++ dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
++
++ pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
++ cu->processor_id_low);
++}
++
++static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev,
++ struct crat_subtype_computeunit *cu)
++{
++ dev->node_props.simd_id_base = cu->processor_id_low;
++ dev->node_props.simd_count = cu->num_simd_cores;
++ dev->node_props.lds_size_in_kb = cu->lds_size_in_kb;
++ dev->node_props.max_waves_per_simd = cu->max_waves_simd;
++ dev->node_props.wave_front_size = cu->wave_front_size;
++ dev->node_props.mem_banks_count = cu->num_banks;
++ dev->node_props.array_count = cu->num_arrays;
++ dev->node_props.cu_per_simd_array = cu->num_cu_per_array;
++ dev->node_props.simd_per_cu = cu->num_simd_per_cu;
++ dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu;
++ if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE)
++ dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE;
++ pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores,
++ cu->processor_id_low);
++}
++
++/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */
++static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu)
++{
++ struct kfd_topology_device *dev;
++ int i = 0;
++
++ pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
++ cu->proximity_domain, cu->hsa_capability);
++ list_for_each_entry(dev, &topology_device_list, list) {
++ if (cu->proximity_domain == i) {
++ if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT)
++ kfd_populated_cu_info_cpu(dev, cu);
++
++ if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT)
++ kfd_populated_cu_info_gpu(dev, cu);
+ break;
+ }
++ i++;
++ }
+
+- up_read(&topology_lock);
++ return 0;
++}
+
+- return device;
++/*
++ * kfd_parse_subtype_mem is called when the topology mutex is
++ * already acquired
++ */
++static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem)
++{
++ struct kfd_mem_properties *props;
++ struct kfd_topology_device *dev;
++ int i = 0;
++
++ pr_info("Found memory entry in CRAT table with proximity_domain=%d\n",
++ mem->promixity_domain);
++ list_for_each_entry(dev, &topology_device_list, list) {
++ if (mem->promixity_domain == i) {
++ props = kfd_alloc_struct(props);
++ if (props == NULL)
++ return -ENOMEM;
++
++ if (dev->node_props.cpu_cores_count == 0)
++ props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE;
++ else
++ props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM;
++
++ if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE)
++ props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE;
++ if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE)
++ props->flags |= HSA_MEM_FLAGS_NON_VOLATILE;
++
++ props->size_in_bytes =
++ ((uint64_t)mem->length_high << 32) +
++ mem->length_low;
++ props->width = mem->width;
++
++ dev->mem_bank_count++;
++ list_add_tail(&props->list, &dev->mem_props);
++
++ break;
++ }
++ i++;
++ }
++
++ return 0;
+ }
+
+-struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd)
++/*
++ * kfd_parse_subtype_cache is called when the topology mutex
++ * is already acquired
++ */
++static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache)
+ {
+- struct kfd_topology_device *top_dev;
+- struct kfd_dev *device = NULL;
++ struct kfd_cache_properties *props;
++ struct kfd_topology_device *dev;
++ uint32_t id;
+
+- down_read(&topology_lock);
++ id = cache->processor_id_low;
++
++ pr_info("Found cache entry in CRAT table with processor_id=%d\n", id);
++ list_for_each_entry(dev, &topology_device_list, list)
++ if (id == dev->node_props.cpu_core_id_base ||
++ id == dev->node_props.simd_id_base) {
++ props = kfd_alloc_struct(props);
++ if (props == NULL)
++ return -ENOMEM;
++
++ props->processor_id_low = id;
++ props->cache_level = cache->cache_level;
++ props->cache_size = cache->cache_size;
++ props->cacheline_size = cache->cache_line_size;
++ props->cachelines_per_tag = cache->lines_per_tag;
++ props->cache_assoc = cache->associativity;
++ props->cache_latency = cache->cache_latency;
++
++ if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE)
++ props->cache_type |= HSA_CACHE_TYPE_DATA;
++ if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE)
++ props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION;
++ if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE)
++ props->cache_type |= HSA_CACHE_TYPE_CPU;
++ if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE)
++ props->cache_type |= HSA_CACHE_TYPE_HSACU;
++
++ dev->cache_count++;
++ dev->node_props.caches_count++;
++ list_add_tail(&props->list, &dev->cache_props);
+
+- list_for_each_entry(top_dev, &topology_device_list, list)
+- if (top_dev->gpu && top_dev->gpu->kgd == kgd) {
+- device = top_dev->gpu;
+ break;
+ }
+
+- up_read(&topology_lock);
++ return 0;
++}
+
+- return device;
++/*
++ * kfd_parse_subtype_iolink is called when the topology mutex
++ * is already acquired
++ */
++static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink)
++{
++ struct kfd_iolink_properties *props;
++ struct kfd_topology_device *dev;
++ uint32_t i = 0;
++ uint32_t id_from;
++ uint32_t id_to;
++
++ id_from = iolink->proximity_domain_from;
++ id_to = iolink->proximity_domain_to;
++
++ pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from);
++ list_for_each_entry(dev, &topology_device_list, list) {
++ if (id_from == i) {
++ props = kfd_alloc_struct(props);
++ if (props == NULL)
++ return -ENOMEM;
++
++ props->node_from = id_from;
++ props->node_to = id_to;
++ props->ver_maj = iolink->version_major;
++ props->ver_min = iolink->version_minor;
++
++ /*
++ * weight factor (derived from CDIR), currently always 1
++ */
++ props->weight = 1;
++
++ props->min_latency = iolink->minimum_latency;
++ props->max_latency = iolink->maximum_latency;
++ props->min_bandwidth = iolink->minimum_bandwidth_mbs;
++ props->max_bandwidth = iolink->maximum_bandwidth_mbs;
++ props->rec_transfer_size =
++ iolink->recommended_transfer_size;
++
++ dev->io_link_count++;
++ dev->node_props.io_links_count++;
++ list_add_tail(&props->list, &dev->io_link_props);
++
++ break;
++ }
++ i++;
++ }
++
++ return 0;
++}
++
++static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr)
++{
++ struct crat_subtype_computeunit *cu;
++ struct crat_subtype_memory *mem;
++ struct crat_subtype_cache *cache;
++ struct crat_subtype_iolink *iolink;
++ int ret = 0;
++
++ switch (sub_type_hdr->type) {
++ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
++ cu = (struct crat_subtype_computeunit *)sub_type_hdr;
++ ret = kfd_parse_subtype_cu(cu);
++ break;
++ case CRAT_SUBTYPE_MEMORY_AFFINITY:
++ mem = (struct crat_subtype_memory *)sub_type_hdr;
++ ret = kfd_parse_subtype_mem(mem);
++ break;
++ case CRAT_SUBTYPE_CACHE_AFFINITY:
++ cache = (struct crat_subtype_cache *)sub_type_hdr;
++ ret = kfd_parse_subtype_cache(cache);
++ break;
++ case CRAT_SUBTYPE_TLB_AFFINITY:
++ /*
++ * For now, nothing to do here
++ */
++ pr_info("Found TLB entry in CRAT table (not processing)\n");
++ break;
++ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
++ /*
++ * For now, nothing to do here
++ */
++ pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n");
++ break;
++ case CRAT_SUBTYPE_IOLINK_AFFINITY:
++ iolink = (struct crat_subtype_iolink *)sub_type_hdr;
++ ret = kfd_parse_subtype_iolink(iolink);
++ break;
++ default:
++ pr_warn("Unknown subtype (%d) in CRAT\n",
++ sub_type_hdr->type);
++ }
++
++ return ret;
+ }
+
+-/* Called with write topology_lock acquired */
+ static void kfd_release_topology_device(struct kfd_topology_device *dev)
+ {
+ struct kfd_mem_properties *mem;
+ struct kfd_cache_properties *cache;
+ struct kfd_iolink_properties *iolink;
+- struct kfd_perf_properties *perf;
+
+ list_del(&dev->list);
+
+@@ -147,35 +375,25 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev)
+ kfree(iolink);
+ }
+
+- while (dev->perf_props.next != &dev->perf_props) {
+- perf = container_of(dev->perf_props.next,
+- struct kfd_perf_properties, list);
+- list_del(&perf->list);
+- kfree(perf);
+- }
+-
+ kfree(dev);
++
++ sys_props.num_devices--;
+ }
+
+-void kfd_release_topology_device_list(struct list_head *device_list)
++static void kfd_release_live_view(void)
+ {
+ struct kfd_topology_device *dev;
+
+- while (!list_empty(device_list)) {
+- dev = list_first_entry(device_list,
+- struct kfd_topology_device, list);
++ while (topology_device_list.next != &topology_device_list) {
++ dev = container_of(topology_device_list.next,
++ struct kfd_topology_device, list);
+ kfd_release_topology_device(dev);
+- }
+ }
+
+-static void kfd_release_live_view(void)
+-{
+- kfd_release_topology_device_list(&topology_device_list);
+ memset(&sys_props, 0, sizeof(sys_props));
+ }
+
+-struct kfd_topology_device *kfd_create_topology_device(
+- struct list_head *device_list)
++static struct kfd_topology_device *kfd_create_topology_device(void)
+ {
+ struct kfd_topology_device *dev;
+
+@@ -188,13 +406,66 @@ struct kfd_topology_device *kfd_create_topology_device(
+ INIT_LIST_HEAD(&dev->mem_props);
+ INIT_LIST_HEAD(&dev->cache_props);
+ INIT_LIST_HEAD(&dev->io_link_props);
+- INIT_LIST_HEAD(&dev->perf_props);
+
+- list_add_tail(&dev->list, device_list);
++ list_add_tail(&dev->list, &topology_device_list);
++ sys_props.num_devices++;
+
+ return dev;
+ }
+
++static int kfd_parse_crat_table(void *crat_image)
++{
++ struct kfd_topology_device *top_dev;
++ struct crat_subtype_generic *sub_type_hdr;
++ uint16_t node_id;
++ int ret;
++ struct crat_header *crat_table = (struct crat_header *)crat_image;
++ uint16_t num_nodes;
++ uint32_t image_len;
++
++ if (!crat_image)
++ return -EINVAL;
++
++ num_nodes = crat_table->num_domains;
++ image_len = crat_table->length;
++
++ pr_info("Parsing CRAT table with %d nodes\n", num_nodes);
++
++ for (node_id = 0; node_id < num_nodes; node_id++) {
++ top_dev = kfd_create_topology_device();
++ if (!top_dev) {
++ kfd_release_live_view();
++ return -ENOMEM;
++ }
++ }
++
++ sys_props.platform_id =
++ (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK;
++ sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id);
++ sys_props.platform_rev = crat_table->revision;
++
++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
++ while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) <
++ ((char *)crat_image) + image_len) {
++ if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) {
++ ret = kfd_parse_subtype(sub_type_hdr);
++ if (ret != 0) {
++ kfd_release_live_view();
++ return ret;
++ }
++ }
++
++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
++ sub_type_hdr->length);
++ }
++
++ sys_props.generation_count++;
++ topology_crat_parsed = 1;
++
++ return 0;
++}
++
++
+ #define sysfs_show_gen_prop(buffer, fmt, ...) \
+ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__)
+ #define sysfs_show_32bit_prop(buffer, name, value) \
+@@ -203,8 +474,6 @@ struct kfd_topology_device *kfd_create_topology_device(
+ sysfs_show_gen_prop(buffer, "%s %llu\n", name, value)
+ #define sysfs_show_32bit_val(buffer, value) \
+ sysfs_show_gen_prop(buffer, "%u\n", value)
+-#define sysfs_show_64bit_val(buffer, value) \
+- sysfs_show_gen_prop(buffer, "%llu\n", value)
+ #define sysfs_show_str_val(buffer, value) \
+ sysfs_show_gen_prop(buffer, "%s\n", value)
+
+@@ -232,17 +501,11 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr,
+ return ret;
+ }
+
+-static void kfd_topology_kobj_release(struct kobject *kobj)
+-{
+- kfree(kobj);
+-}
+-
+ static const struct sysfs_ops sysprops_ops = {
+ .show = sysprops_show,
+ };
+
+ static struct kobj_type sysprops_type = {
+- .release = kfd_topology_kobj_release,
+ .sysfs_ops = &sysprops_ops,
+ };
+
+@@ -278,7 +541,6 @@ static const struct sysfs_ops iolink_ops = {
+ };
+
+ static struct kobj_type iolink_type = {
+- .release = kfd_topology_kobj_release,
+ .sysfs_ops = &iolink_ops,
+ };
+
+@@ -287,23 +549,11 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr,
+ {
+ ssize_t ret;
+ struct kfd_mem_properties *mem;
+- uint64_t used_mem;
+
+ /* Making sure that the buffer is an empty string */
+ buffer[0] = 0;
+
+- if (strcmp(attr->name, "used_memory") == 0) {
+- mem = container_of(attr, struct kfd_mem_properties,
+- attr_used);
+- if (mem->gpu) {
+- used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd);
+- return sysfs_show_64bit_val(buffer, used_mem);
+- }
+- /* TODO: Report APU/CPU-allocated memory; For now return 0 */
+- return 0;
+- }
+-
+- mem = container_of(attr, struct kfd_mem_properties, attr_props);
++ mem = container_of(attr, struct kfd_mem_properties, attr);
+ sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type);
+ sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes);
+ sysfs_show_32bit_prop(buffer, "flags", mem->flags);
+@@ -318,7 +568,6 @@ static const struct sysfs_ops mem_ops = {
+ };
+
+ static struct kobj_type mem_type = {
+- .release = kfd_topology_kobj_release,
+ .sysfs_ops = &mem_ops,
+ };
+
+@@ -326,7 +575,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
+ char *buffer)
+ {
+ ssize_t ret;
+- uint32_t i, j;
++ uint32_t i;
+ struct kfd_cache_properties *cache;
+
+ /* Making sure that the buffer is an empty string */
+@@ -344,18 +593,12 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
+ sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency);
+ sysfs_show_32bit_prop(buffer, "type", cache->cache_type);
+ snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer);
+- for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++)
+- for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) {
+- /* Check each bit */
+- if (cache->sibling_map[i] & (1 << j))
+- ret = snprintf(buffer, PAGE_SIZE,
+- "%s%d%s", buffer, 1, ",");
+- else
+- ret = snprintf(buffer, PAGE_SIZE,
+- "%s%d%s", buffer, 0, ",");
+- }
+- /* Replace the last "," with end of line */
+- *(buffer + strlen(buffer) - 1) = 0xA;
++ for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++)
++ ret = snprintf(buffer, PAGE_SIZE, "%s%d%s",
++ buffer, cache->sibling_map[i],
++ (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ?
++ "\n" : ",");
++
+ return ret;
+ }
+
+@@ -364,43 +607,9 @@ static const struct sysfs_ops cache_ops = {
+ };
+
+ static struct kobj_type cache_type = {
+- .release = kfd_topology_kobj_release,
+ .sysfs_ops = &cache_ops,
+ };
+
+-/****** Sysfs of Performance Counters ******/
+-
+-struct kfd_perf_attr {
+- struct kobj_attribute attr;
+- uint32_t data;
+-};
+-
+-static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs,
+- char *buf)
+-{
+- struct kfd_perf_attr *attr;
+-
+- buf[0] = 0;
+- attr = container_of(attrs, struct kfd_perf_attr, attr);
+- if (!attr->data) /* invalid data for PMC */
+- return 0;
+- else
+- return sysfs_show_32bit_val(buf, attr->data);
+-}
+-
+-#define KFD_PERF_DESC(_name, _data) \
+-{ \
+- .attr = __ATTR(_name, 0444, perf_show, NULL), \
+- .data = _data, \
+-}
+-
+-static struct kfd_perf_attr perf_attr_iommu[] = {
+- KFD_PERF_DESC(max_concurrent, 0),
+- KFD_PERF_DESC(num_counters, 0),
+- KFD_PERF_DESC(counter_ids, 0),
+-};
+-/****************************************/
+-
+ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ char *buffer)
+ {
+@@ -408,7 +617,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
+ uint32_t i;
+ uint32_t log_max_watch_addr;
+- struct kfd_local_mem_info local_mem_info;
+
+ /* Making sure that the buffer is an empty string */
+ buffer[0] = 0;
+@@ -438,8 +646,18 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ dev->node_props.cpu_cores_count);
+ sysfs_show_32bit_prop(buffer, "simd_count",
+ dev->node_props.simd_count);
+- sysfs_show_32bit_prop(buffer, "mem_banks_count",
+- dev->node_props.mem_banks_count);
++
++ if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
++ pr_info_once("mem_banks_count truncated from %d to %d\n",
++ dev->node_props.mem_banks_count,
++ dev->mem_bank_count);
++ sysfs_show_32bit_prop(buffer, "mem_banks_count",
++ dev->mem_bank_count);
++ } else {
++ sysfs_show_32bit_prop(buffer, "mem_banks_count",
++ dev->node_props.mem_banks_count);
++ }
++
+ sysfs_show_32bit_prop(buffer, "caches_count",
+ dev->node_props.caches_count);
+ sysfs_show_32bit_prop(buffer, "io_links_count",
+@@ -472,8 +690,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ dev->node_props.device_id);
+ sysfs_show_32bit_prop(buffer, "location_id",
+ dev->node_props.location_id);
+- sysfs_show_32bit_prop(buffer, "drm_render_minor",
+- dev->node_props.drm_render_minor);
+
+ if (dev->gpu) {
+ log_max_watch_addr =
+@@ -489,29 +705,17 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
+ HSA_CAP_WATCH_POINTS_TOTALBITS_MASK);
+ }
+
+- if (dev->gpu->device_info->asic_family == CHIP_TONGA)
+- dev->node_props.capability |=
+- HSA_CAP_AQL_QUEUE_DOUBLE_MAP;
+-
+ sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
+- dev->node_props.max_engine_clk_fcompute);
++ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(
++ dev->gpu->kgd));
+
+- /*
+- * If the ASIC is APU except Kaveri, set local memory size
+- * to 0 to disable local memory support
+- */
+- if (!dev->gpu->device_info->is_need_iommu_device
+- || dev->gpu->device_info->asic_family == CHIP_KAVERI) {
+- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
+- &local_mem_info);
+- sysfs_show_64bit_prop(buffer, "local_mem_size",
+- local_mem_info.local_mem_size_private +
+- local_mem_info.local_mem_size_public);
+- } else
+- sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL);
++ sysfs_show_64bit_prop(buffer, "local_mem_size",
++ (unsigned long long int) 0);
+
+ sysfs_show_32bit_prop(buffer, "fw_version",
+- dev->gpu->mec_fw_version);
++ dev->gpu->kfd2kgd->get_fw_version(
++ dev->gpu->kgd,
++ KGD_ENGINE_MEC1));
+ sysfs_show_32bit_prop(buffer, "capability",
+ dev->node_props.capability);
+ }
+@@ -525,7 +729,6 @@ static const struct sysfs_ops node_ops = {
+ };
+
+ static struct kobj_type node_type = {
+- .release = kfd_topology_kobj_release,
+ .sysfs_ops = &node_ops,
+ };
+
+@@ -541,7 +744,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ struct kfd_iolink_properties *iolink;
+ struct kfd_cache_properties *cache;
+ struct kfd_mem_properties *mem;
+- struct kfd_perf_properties *perf;
+
+ if (dev->kobj_iolink) {
+ list_for_each_entry(iolink, &dev->io_link_props, list)
+@@ -570,12 +772,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ if (dev->kobj_mem) {
+ list_for_each_entry(mem, &dev->mem_props, list)
+ if (mem->kobj) {
+- /* TODO: Remove when CPU/APU supported */
+- if (dev->node_props.cpu_cores_count == 0)
+- sysfs_remove_file(mem->kobj,
+- &mem->attr_used);
+- kfd_remove_sysfs_file(mem->kobj,
+- &mem->attr_props);
++ kfd_remove_sysfs_file(mem->kobj, &mem->attr);
+ mem->kobj = NULL;
+ }
+ kobject_del(dev->kobj_mem);
+@@ -583,16 +780,6 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
+ dev->kobj_mem = NULL;
+ }
+
+- if (dev->kobj_perf) {
+- list_for_each_entry(perf, &dev->perf_props, list) {
+- kfree(perf->attr_group);
+- perf->attr_group = NULL;
+- }
+- kobject_del(dev->kobj_perf);
+- kobject_put(dev->kobj_perf);
+- dev->kobj_perf = NULL;
+- }
+-
+ if (dev->kobj_node) {
+ sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid);
+ sysfs_remove_file(dev->kobj_node, &dev->attr_name);
+@@ -609,9 +796,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ struct kfd_iolink_properties *iolink;
+ struct kfd_cache_properties *cache;
+ struct kfd_mem_properties *mem;
+- struct kfd_perf_properties *perf;
+- uint32_t num_attrs;
+- struct attribute **attrs;
+ int ret;
+ uint32_t i;
+
+@@ -642,10 +826,6 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ if (!dev->kobj_iolink)
+ return -ENOMEM;
+
+- dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node);
+- if (!dev->kobj_perf)
+- return -ENOMEM;
+-
+ /*
+ * Creating sysfs files for node properties
+ */
+@@ -678,23 +858,12 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ if (ret < 0)
+ return ret;
+
+- mem->attr_props.name = "properties";
+- mem->attr_props.mode = KFD_SYSFS_FILE_MODE;
+- sysfs_attr_init(&mem->attr_props);
+- ret = sysfs_create_file(mem->kobj, &mem->attr_props);
++ mem->attr.name = "properties";
++ mem->attr.mode = KFD_SYSFS_FILE_MODE;
++ sysfs_attr_init(&mem->attr);
++ ret = sysfs_create_file(mem->kobj, &mem->attr);
+ if (ret < 0)
+ return ret;
+-
+- /* TODO: Support APU/CPU memory usage */
+- if (dev->node_props.cpu_cores_count == 0) {
+- mem->attr_used.name = "used_memory";
+- mem->attr_used.mode = KFD_SYSFS_FILE_MODE;
+- sysfs_attr_init(&mem->attr_used);
+- ret = sysfs_create_file(mem->kobj, &mem->attr_used);
+- if (ret < 0)
+- return ret;
+- }
+-
+ i++;
+ }
+
+@@ -734,38 +903,11 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
+ if (ret < 0)
+ return ret;
+ i++;
+- }
+-
+- /* All hardware blocks have the same number of attributes. */
+- num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr);
+- list_for_each_entry(perf, &dev->perf_props, list) {
+- perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr)
+- * num_attrs + sizeof(struct attribute_group),
+- GFP_KERNEL);
+- if (!perf->attr_group)
+- return -ENOMEM;
+-
+- attrs = (struct attribute **)(perf->attr_group + 1);
+- if (!strcmp(perf->block_name, "iommu")) {
+- /* Information of IOMMU's num_counters and counter_ids is shown
+- * under /sys/bus/event_source/devices/amd_iommu. We don't
+- * duplicate here.
+- */
+- perf_attr_iommu[0].data = perf->max_concurrent;
+- for (i = 0; i < num_attrs; i++)
+- attrs[i] = &perf_attr_iommu[i].attr.attr;
+- }
+- perf->attr_group->name = perf->block_name;
+- perf->attr_group->attrs = attrs;
+- ret = sysfs_create_group(dev->kobj_perf, perf->attr_group);
+- if (ret < 0)
+- return ret;
+- }
++}
+
+ return 0;
+ }
+
+-/* Called with write topology lock acquired */
+ static int kfd_build_sysfs_node_tree(void)
+ {
+ struct kfd_topology_device *dev;
+@@ -782,7 +924,6 @@ static int kfd_build_sysfs_node_tree(void)
+ return 0;
+ }
+
+-/* Called with write topology lock acquired */
+ static void kfd_remove_sysfs_node_tree(void)
+ {
+ struct kfd_topology_device *dev;
+@@ -854,251 +995,75 @@ static void kfd_topology_release_sysfs(void)
+ }
+ }
+
+-/* Called with write topology_lock acquired */
+-static void kfd_topology_update_device_list(struct list_head *temp_list,
+- struct list_head *master_list)
+-{
+- while (!list_empty(temp_list)) {
+- list_move_tail(temp_list->next, master_list);
+- sys_props.num_devices++;
+- }
+-}
+-
+-static void kfd_debug_print_topology(void)
+-{
+- struct kfd_topology_device *dev;
+-
+- down_read(&topology_lock);
+-
+- dev = list_last_entry(&topology_device_list,
+- struct kfd_topology_device, list);
+- if (dev) {
+- if (dev->node_props.cpu_cores_count &&
+- dev->node_props.simd_count) {
+- pr_info("Topology: Add APU node [0x%0x:0x%0x]\n",
+- dev->node_props.device_id,
+- dev->node_props.vendor_id);
+- } else if (dev->node_props.cpu_cores_count)
+- pr_info("Topology: Add CPU node\n");
+- else if (dev->node_props.simd_count)
+- pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n",
+- dev->node_props.device_id,
+- dev->node_props.vendor_id);
+- }
+- up_read(&topology_lock);
+-}
+-
+-/* Helper function for initializing platform_xx members of kfd_system_properties
+- */
+-static void kfd_update_system_properties(void)
+-{
+- struct kfd_topology_device *dev;
+-
+- down_read(&topology_lock);
+- dev = list_last_entry(&topology_device_list,
+- struct kfd_topology_device, list);
+- if (dev) {
+- sys_props.platform_id =
+- (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK;
+- sys_props.platform_oem = *((uint64_t *)dev->oem_table_id);
+- sys_props.platform_rev = dev->oem_revision;
+- }
+- up_read(&topology_lock);
+-}
+-
+-static void find_system_memory(const struct dmi_header *dm,
+- void *private)
+-{
+- struct kfd_mem_properties *mem;
+- u16 mem_width, mem_clock;
+- struct kfd_topology_device *kdev =
+- (struct kfd_topology_device *)private;
+- const u8 *dmi_data = (const u8 *)(dm + 1);
+-
+- if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) {
+- mem_width = (u16)(*(const u16 *)(dmi_data + 0x6));
+- mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11));
+- list_for_each_entry(mem, &kdev->mem_props, list) {
+- if (mem_width != 0xFFFF && mem_width != 0)
+- mem->width = mem_width;
+- if (mem_clock != 0)
+- mem->mem_clk_max = mem_clock;
+- }
+- }
+-}
+-
+-/*
+- * Performance counters information is not part of CRAT but we would like to
+- * put them in the sysfs under topology directory for Thunk to get the data.
+- * This function is called before updating the sysfs.
+- */
+-static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev)
+-{
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+- struct kfd_perf_properties *props;
+-
+- if (amd_iommu_pc_supported()) {
+- props = kfd_alloc_struct(props);
+- if (!props)
+- return -ENOMEM;
+- strcpy(props->block_name, "iommu");
+- props->max_concurrent = amd_iommu_pc_get_max_banks(0) *
+- amd_iommu_pc_get_max_counters(0); /* assume one iommu */
+- list_add_tail(&props->list, &kdev->perf_props);
+- }
+-#endif
+-
+- return 0;
+-}
+-
+-/* kfd_add_non_crat_information - Add information that is not currently
+- * defined in CRAT but is necessary for KFD topology
+- * @kdev - topology device to which additional info is added
+- */
+-static void kfd_add_non_crat_information(struct kfd_topology_device *kdev)
+-{
+- /* Check if CPU only node. */
+- if (!kdev->gpu) {
+- /* Add system memory information */
+- dmi_walk(find_system_memory, kdev);
+- }
+- /* TODO: For GPU node, rearrange code from kfd_topology_add_device */
+-}
+-
+-#ifdef CONFIG_ACPI
+-/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices.
+- * Ignore CRAT for all other devices. AMD APU is identified if both CPU
+- * and GPU cores are present.
+- * @device_list - topology device list created by parsing ACPI CRAT table.
+- * @return - TRUE if invalid, FALSE if valid.
+- */
+-static bool kfd_is_acpi_crat_invalid(struct list_head *device_list)
+-{
+- struct kfd_topology_device *dev;
+-
+- list_for_each_entry(dev, device_list, list) {
+- if (dev->node_props.cpu_cores_count &&
+- dev->node_props.simd_count)
+- return false;
+- }
+- pr_info("Ignoring ACPI CRAT on non-APU system\n");
+- return true;
+-}
+-#endif
+-
+ int kfd_topology_init(void)
+ {
+ void *crat_image = NULL;
+ size_t image_size = 0;
+ int ret;
+- struct list_head temp_topology_device_list;
+- int cpu_only_node = 0;
+- struct kfd_topology_device *kdev;
+- int proximity_domain;
+-
+- /* topology_device_list - Master list of all topology devices
+- * temp_topology_device_list - temporary list created while parsing CRAT
+- * or VCRAT. Once parsing is complete the contents of list is moved to
+- * topology_device_list
+- */
+
+-	/* Initialize the head for both the lists */
++ /*
++ * Initialize the head for the topology device list
++ */
+ INIT_LIST_HEAD(&topology_device_list);
+- INIT_LIST_HEAD(&temp_topology_device_list);
+ init_rwsem(&topology_lock);
++ topology_crat_parsed = 0;
+
+ memset(&sys_props, 0, sizeof(sys_props));
+
+- /* Proximity domains in ACPI CRAT tables start counting at
+- * 0. The same should be true for virtual CRAT tables created
+- * at this stage. GPUs added later in kfd_topology_add_device
+- * use a counter.
+- */
+- proximity_domain = 0;
+-
+ /*
+- * Get the CRAT image from the ACPI. If ACPI doesn't have one
+- * or if ACPI CRAT is invalid create a virtual CRAT.
+- * NOTE: The current implementation expects all AMD APUs to have
+- * CRAT. If no CRAT is available, it is assumed to be a CPU
++ * Get the CRAT image from the ACPI
+ */
+-#ifdef CONFIG_ACPI
+- ret = kfd_create_crat_image_acpi(&crat_image, &image_size);
+- if (ret == 0) {
+- ret = kfd_parse_crat_table(crat_image,
+- &temp_topology_device_list,
+- proximity_domain);
+- if (ret ||
+- kfd_is_acpi_crat_invalid(&temp_topology_device_list)) {
+-
+- kfd_release_topology_device_list(
+- &temp_topology_device_list);
+- kfd_destroy_crat_image(crat_image);
+- crat_image = NULL;
+- }
+- }
+-#endif
+- if (!crat_image) {
+- ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
+- COMPUTE_UNIT_CPU, NULL,
+- proximity_domain);
+- cpu_only_node = 1;
+- if (ret) {
+- pr_err("Error creating VCRAT table for CPU\n");
+- return ret;
+- }
+-
+- ret = kfd_parse_crat_table(crat_image,
+- &temp_topology_device_list,
+- proximity_domain);
+- if (ret) {
+- pr_err("Error parsing VCRAT table for CPU\n");
++ ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
++ if (ret == 0 && image_size > 0) {
++ pr_info("Found CRAT image with size=%zd\n", image_size);
++ crat_image = kmalloc(image_size, GFP_KERNEL);
++ if (!crat_image) {
++ ret = -ENOMEM;
++ pr_err("No memory for allocating CRAT image\n");
+ goto err;
+ }
+- }
+-
+- kdev = list_first_entry(&temp_topology_device_list,
+- struct kfd_topology_device, list);
+- kfd_add_perf_to_topology(kdev);
+-
+- down_write(&topology_lock);
+- kfd_topology_update_device_list(&temp_topology_device_list,
+- &topology_device_list);
+- atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
+- ret = kfd_topology_update_sysfs();
+- up_write(&topology_lock);
+-
+- if (ret == 0) {
+- sys_props.generation_count++;
+- kfd_update_system_properties();
+- kfd_debug_print_topology();
+- pr_info("Finished initializing topology\n");
+- } else
+- pr_err("Failed to update topology in sysfs ret=%d\n", ret);
+-
+- /* For nodes with GPU, this information gets added
+- * when GPU is detected (kfd_topology_add_device).
+- */
+- if (cpu_only_node) {
+- /* Add additional information to CPU only node created above */
+- down_write(&topology_lock);
+- kdev = list_first_entry(&topology_device_list,
+- struct kfd_topology_device, list);
+- up_write(&topology_lock);
+- kfd_add_non_crat_information(kdev);
++ ret = kfd_topology_get_crat_acpi(crat_image, &image_size);
++
++ if (ret == 0) {
++ down_write(&topology_lock);
++ ret = kfd_parse_crat_table(crat_image);
++ if (ret == 0)
++ ret = kfd_topology_update_sysfs();
++ up_write(&topology_lock);
++ } else {
++ pr_err("Couldn't get CRAT table size from ACPI\n");
++ }
++ kfree(crat_image);
++ } else if (ret == -ENODATA) {
++ ret = 0;
++ } else {
++ pr_err("Couldn't get CRAT table size from ACPI\n");
+ }
+
+ err:
+- kfd_destroy_crat_image(crat_image);
++ pr_info("Finished initializing topology ret=%d\n", ret);
+ return ret;
+ }
+
+ void kfd_topology_shutdown(void)
+ {
+- down_write(&topology_lock);
+ kfd_topology_release_sysfs();
+ kfd_release_live_view();
+- up_write(&topology_lock);
++}
++
++static void kfd_debug_print_topology(void)
++{
++ struct kfd_topology_device *dev;
++ uint32_t i = 0;
++
++ pr_info("DEBUG PRINT OF TOPOLOGY:");
++ list_for_each_entry(dev, &topology_device_list, list) {
++ pr_info("Node: %d\n", i);
++ pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no"));
++ pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count);
++ pr_info("\tSIMD count: %d", dev->node_props.simd_count);
++ i++;
++ }
+ }
+
+ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
+@@ -1107,15 +1072,11 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
+ uint32_t buf[7];
+ uint64_t local_mem_size;
+ int i;
+- struct kfd_local_mem_info local_mem_info;
+
+ if (!gpu)
+ return 0;
+
+- gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info);
+-
+- local_mem_size = local_mem_info.local_mem_size_private +
+- local_mem_info.local_mem_size_public;
++ local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd);
+
+ buf[0] = gpu->pdev->devfn;
+ buf[1] = gpu->pdev->subsystem_vendor;
+@@ -1130,32 +1091,18 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
+
+ return hashout;
+ }
+-/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
+- * the GPU device is not already present in the topology device
+- * list then return NULL. This means a new topology device has to
+- * be created for this GPU.
+- * TODO: Rather than assigning @gpu to the first topology device without
+- * a GPU attached, it would be better to have a more stringent check.
+- */
++
+ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
+ {
+ struct kfd_topology_device *dev;
+ struct kfd_topology_device *out_dev = NULL;
+- struct kfd_mem_properties *mem;
+
+- down_write(&topology_lock);
+ list_for_each_entry(dev, &topology_device_list, list)
+ if (!dev->gpu && (dev->node_props.simd_count > 0)) {
+ dev->gpu = gpu;
+ out_dev = dev;
+-
+- /* Assign mem->gpu */
+- list_for_each_entry(mem, &dev->mem_props, list)
+- mem->gpu = dev->gpu;
+-
+ break;
+ }
+- up_write(&topology_lock);
+
+ return out_dev;
+ }
+@@ -1168,204 +1115,84 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival)
+ */
+ }
+
+-/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info,
+- * patch this after CRAT parsing.
+- */
+-static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
+-{
+- struct kfd_mem_properties *mem;
+- struct kfd_local_mem_info local_mem_info;
+-
+- if (!dev)
+- return;
+-
+- /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with
+-	 * a single bank of VRAM local memory.
+- * for dGPUs - VCRAT reports only one bank of Local Memory
+- * for APUs - If CRAT from ACPI reports more than one bank, then
+- * all the banks will report the same mem_clk_max information
+- */
+- dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
+- &local_mem_info);
+-
+- list_for_each_entry(mem, &dev->mem_props, list)
+- mem->mem_clk_max = local_mem_info.mem_clk_max;
+-}
+-
+-static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
+-{
+- struct kfd_iolink_properties *link;
+-
+- if (!dev || !dev->gpu)
+- return;
+-
+-	/* GPU only creates direct links so apply flags setting to all */
+- if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
+- list_for_each_entry(link, &dev->io_link_props, list)
+- link->flags = CRAT_IOLINK_FLAGS_ENABLED |
+- CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
+- CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
+-}
+-
+ int kfd_topology_add_device(struct kfd_dev *gpu)
+ {
+ uint32_t gpu_id;
+ struct kfd_topology_device *dev;
+- struct kfd_cu_info cu_info;
+- int res = 0;
+- struct list_head temp_topology_device_list;
+- void *crat_image = NULL;
+- size_t image_size = 0;
+- int proximity_domain;
+-
+- INIT_LIST_HEAD(&temp_topology_device_list);
++ int res;
+
+ gpu_id = kfd_generate_gpu_id(gpu);
+
+ pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
+
+- proximity_domain = atomic_inc_return(&
+- topology_crat_proximity_domain);
+-
+- /* Check to see if this gpu device exists in the topology_device_list.
+- * If so, assign the gpu to that device,
+- * else create a Virtual CRAT for this gpu device and then parse that
+- * CRAT to create a new topology device. Once created assign the gpu to
+- * that topology device
++ down_write(&topology_lock);
++ /*
++	 * Try to assign the GPU to an existing topology device (generated from
++	 * the CRAT table)
+ */
+ dev = kfd_assign_gpu(gpu);
+ if (!dev) {
+- res = kfd_create_crat_image_virtual(&crat_image, &image_size,
+- COMPUTE_UNIT_GPU,
+- gpu, proximity_domain);
+- if (res) {
+- pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
+- gpu_id);
+- return res;
+- }
+- res = kfd_parse_crat_table(crat_image,
+- &temp_topology_device_list, proximity_domain);
+- if (res) {
+- pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
+- gpu_id);
++ pr_info("GPU was not found in the current topology. Extending.\n");
++ kfd_debug_print_topology();
++ dev = kfd_create_topology_device();
++ if (!dev) {
++ res = -ENOMEM;
+ goto err;
+ }
++ dev->gpu = gpu;
+
+- down_write(&topology_lock);
+- kfd_topology_update_device_list(&temp_topology_device_list,
+- &topology_device_list);
++ /*
++ * TODO: Make a call to retrieve topology information from the
++ * GPU vBIOS
++ */
+
+ /* Update the SYSFS tree, since we added another topology
+ * device
+ */
+- res = kfd_topology_update_sysfs();
+- up_write(&topology_lock);
+-
+- if (res == 0)
+- sys_props.generation_count++;
+- else
+- pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
+- gpu_id, res);
+- dev = kfd_assign_gpu(gpu);
+- if (!dev) {
+- pr_err("Could not assign GPU\n");
+- res = -ENODEV;
+- goto err;
+- }
++ if (kfd_topology_update_sysfs() < 0)
++ kfd_topology_release_sysfs();
++
+ }
+
+ dev->gpu_id = gpu_id;
+ gpu->id = gpu_id;
+-
+- /* TODO: Move the following lines to function
+- * kfd_add_non_crat_information
+- */
+-
+- /* Fill-in additional information that is not available in CRAT but
+- * needed for the topology
+- */
+-
+- dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
+- dev->node_props.simd_arrays_per_engine =
+- cu_info.num_shader_arrays_per_engine;
+-
+ dev->node_props.vendor_id = gpu->pdev->vendor;
+ dev->node_props.device_id = gpu->pdev->device;
+- dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
+- gpu->pdev->devfn);
+- dev->node_props.max_engine_clk_fcompute =
+- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
+- dev->node_props.max_engine_clk_ccompute =
+- cpufreq_quick_get_max(0) / 1000;
+- dev->node_props.drm_render_minor =
+- gpu->shared_resources.drm_render_minor;
+-
+- kfd_fill_mem_clk_max_info(dev);
+- kfd_fill_iolink_non_crat_info(dev);
+-
+- switch (dev->gpu->device_info->asic_family) {
+- case CHIP_KAVERI:
+- case CHIP_HAWAII:
+- case CHIP_TONGA:
+- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+- break;
+- case CHIP_CARRIZO:
+- case CHIP_FIJI:
+- case CHIP_POLARIS10:
+- case CHIP_POLARIS11:
+- pr_debug("Adding doorbell packet type capability\n");
+- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+- break;
+- case CHIP_VEGA10:
+- case CHIP_RAVEN:
+- dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
+- HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
+- break;
+- default:
+- BUG();
+- }
+-
+- /* Fix errors in CZ CRAT.
+- * simd_count: Carrizo CRAT reports wrong simd_count, probably because
+- * it doesn't consider masked out CUs
+- * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd.
+- * capability flag: Carrizo CRAT doesn't report IOMMU flags.
++ dev->node_props.location_id = (gpu->pdev->bus->number << 24) +
++ (gpu->pdev->devfn & 0xffffff);
++ /*
++ * TODO: Retrieve max engine clock values from KGD
+ */
++
+ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
+- dev->node_props.simd_count =
+- cu_info.simd_per_cu * cu_info.cu_active_number;
+- dev->node_props.max_waves_per_simd = 10;
+- dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
++ dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE;
++ pr_info("Adding doorbell packet type capability\n");
+ }
+
+- kfd_debug_print_topology();
++ res = 0;
+
+- if (!res)
+- kfd_notify_gpu_change(gpu_id, 1);
+ err:
+- kfd_destroy_crat_image(crat_image);
++ up_write(&topology_lock);
++
++ if (res == 0)
++ kfd_notify_gpu_change(gpu_id, 1);
++
+ return res;
+ }
+
+ int kfd_topology_remove_device(struct kfd_dev *gpu)
+ {
+- struct kfd_topology_device *dev, *tmp;
++ struct kfd_topology_device *dev;
+ uint32_t gpu_id;
+ int res = -ENODEV;
+
+ down_write(&topology_lock);
+
+- list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
++ list_for_each_entry(dev, &topology_device_list, list)
+ if (dev->gpu == gpu) {
+ gpu_id = dev->gpu_id;
+ kfd_remove_sysfs_node_entry(dev);
+ kfd_release_topology_device(dev);
+- sys_props.num_devices--;
+ res = 0;
+ if (kfd_topology_update_sysfs() < 0)
+ kfd_topology_release_sysfs();
+@@ -1380,26 +1207,22 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
+ return res;
+ }
+
+-/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD
+- * topology. If a GPU device is found at @idx, then a valid kfd_dev pointer is
+- * returned through @kdev
+- * Return - 0: On success (@kdev will be NULL for non GPU nodes)
+- * -1: If end of list
++/*
++ * When idx is out of bounds, the function will return NULL
+ */
+-int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
++struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx)
+ {
+
+ struct kfd_topology_device *top_dev;
++ struct kfd_dev *device = NULL;
+ uint8_t device_idx = 0;
+
+- *kdev = NULL;
+ down_read(&topology_lock);
+
+ list_for_each_entry(top_dev, &topology_device_list, list) {
+ if (device_idx == idx) {
+- *kdev = top_dev->gpu;
+- up_read(&topology_lock);
+- return 0;
++ device = top_dev->gpu;
++ break;
+ }
+
+ device_idx++;
+@@ -1407,89 +1230,6 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev)
+
+ up_read(&topology_lock);
+
+- return -1;
+-
+-}
+-
+-static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask)
+-{
+- int first_cpu_of_numa_node;
+-
+- if (!cpumask || (cpumask == cpu_none_mask))
+- return -1;
+- first_cpu_of_numa_node = cpumask_first(cpumask);
+- if (first_cpu_of_numa_node >= nr_cpu_ids)
+- return -1;
+-#ifdef CONFIG_X86_64
+- return cpu_data(first_cpu_of_numa_node).apicid;
+-#else
+- return first_cpu_of_numa_node;
+-#endif
+-}
+-
+-/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor
+- * of the given NUMA node (numa_node_id)
+- * Return -1 on failure
+- */
+-int kfd_numa_node_to_apic_id(int numa_node_id)
+-{
+- if (numa_node_id == -1) {
+- pr_warn("Invalid NUMA Node. Use online CPU mask\n");
+- return kfd_cpumask_to_apic_id(cpu_online_mask);
+- }
+- return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id));
+-}
+-
+-#if defined(CONFIG_DEBUG_FS)
+-
+-int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data)
+-{
+- struct kfd_topology_device *dev;
+- unsigned int i = 0;
+- int r = 0;
+-
+- down_read(&topology_lock);
+-
+- list_for_each_entry(dev, &topology_device_list, list) {
+- if (!dev->gpu) {
+- i++;
+- continue;
+- }
+-
+- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+- r = device_queue_manager_debugfs_hqds(m, dev->gpu->dqm);
+- if (r != 0)
+- break;
+- }
+-
+- up_read(&topology_lock);
+-
+- return r;
+-}
+-
+-int kfd_debugfs_rls_by_device(struct seq_file *m, void *data)
+-{
+- struct kfd_topology_device *dev;
+- unsigned int i = 0;
+- int r = 0;
+-
+- down_read(&topology_lock);
+-
+- list_for_each_entry(dev, &topology_device_list, list) {
+- if (!dev->gpu) {
+- i++;
+- continue;
+- }
+-
+- seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id);
+- r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets);
+- if (r != 0)
+- break;
+- }
+-
+- up_read(&topology_lock);
++ return device;
+
+- return r;
+ }
+-
+-#endif
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+index d1c9ba3..c3ddb9b 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+@@ -39,17 +39,8 @@
+ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
+ #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
+-#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
+-#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
+-#define HSA_CAP_RESERVED 0xffffc000
+-
+-#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
+-#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
+-#define HSA_CAP_DOORBELL_TYPE_2_0 0x2
+-#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
+-#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
++#define HSA_CAP_RESERVED 0xfffff000
+ #define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000
+-#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000
+
+ struct kfd_node_properties {
+ uint32_t cpu_cores_count;
+@@ -75,7 +66,6 @@ struct kfd_node_properties {
+ uint32_t location_id;
+ uint32_t max_engine_clk_fcompute;
+ uint32_t max_engine_clk_ccompute;
+- int32_t drm_render_minor;
+ uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
+ };
+
+@@ -98,11 +88,11 @@ struct kfd_mem_properties {
+ uint32_t width;
+ uint32_t mem_clk_max;
+ struct kobject *kobj;
+- struct kfd_dev *gpu;
+- struct attribute attr_props;
+- struct attribute attr_used;
++ struct attribute attr;
+ };
+
++#define KFD_TOPOLOGY_CPU_SIBLINGS 256
++
+ #define HSA_CACHE_TYPE_DATA 0x00000001
+ #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002
+ #define HSA_CACHE_TYPE_CPU 0x00000004
+@@ -119,7 +109,7 @@ struct kfd_cache_properties {
+ uint32_t cache_assoc;
+ uint32_t cache_latency;
+ uint32_t cache_type;
+- uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE];
++ uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS];
+ struct kobject *kobj;
+ struct attribute attr;
+ };
+@@ -142,36 +132,24 @@ struct kfd_iolink_properties {
+ struct attribute attr;
+ };
+
+-struct kfd_perf_properties {
+- struct list_head list;
+- char block_name[16];
+- uint32_t max_concurrent;
+- struct attribute_group *attr_group;
+-};
+-
+ struct kfd_topology_device {
+ struct list_head list;
+ uint32_t gpu_id;
+- uint32_t proximity_domain;
+ struct kfd_node_properties node_props;
++ uint32_t mem_bank_count;
+ struct list_head mem_props;
+ uint32_t cache_count;
+ struct list_head cache_props;
+ uint32_t io_link_count;
+ struct list_head io_link_props;
+- struct list_head perf_props;
+ struct kfd_dev *gpu;
+ struct kobject *kobj_node;
+ struct kobject *kobj_mem;
+ struct kobject *kobj_cache;
+ struct kobject *kobj_iolink;
+- struct kobject *kobj_perf;
+ struct attribute attr_gpuid;
+ struct attribute attr_name;
+ struct attribute attr_props;
+- uint8_t oem_id[CRAT_OEMID_LENGTH];
+- uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH];
+- uint32_t oem_revision;
+ };
+
+ struct kfd_system_properties {
+@@ -186,14 +164,6 @@ struct kfd_system_properties {
+ struct attribute attr_props;
+ };
+
+-struct kfd_topology_device *kfd_create_topology_device(
+- struct list_head *device_list);
+-void kfd_release_topology_device_list(struct list_head *device_list);
+
+-#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2)
+-extern bool amd_iommu_pc_supported(void);
+-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+-#endif
+
+ #endif /* __KFD_TOPOLOGY_H__ */
+diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h
+deleted file mode 100644
+index e00d03d..0000000
+--- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h
++++ /dev/null
+@@ -1,84 +0,0 @@
+-/*
+- * Copyright 2016 Advanced Micro Devices, Inc.
+- *
+- * Permission is hereby granted, free of charge, to any person obtaining a
+- * copy of this software and associated documentation files (the "Software"),
+- * to deal in the Software without restriction, including without limitation
+- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+- * and/or sell copies of the Software, and to permit persons to whom the
+- * Software is furnished to do so, subject to the following conditions:
+- *
+- * The above copyright notice and this permission notice shall be included in
+- * all copies or substantial portions of the Software.
+- *
+- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+- * OTHER DEALINGS IN THE SOFTWARE.
+- */
+-
+-#ifndef HSA_SOC15_INT_H_INCLUDED
+-#define HSA_SOC15_INT_H_INCLUDED
+-/*
+- * vega10+ IH clients
+- */
+-enum soc15_ih_client_id {
+- SOC15_IH_CLIENTID_IH = 0x00,
+- SOC15_IH_CLIENTID_ACP = 0x01,
+- SOC15_IH_CLIENTID_ATHUB = 0x02,
+- SOC15_IH_CLIENTID_BIF = 0x03,
+- SOC15_IH_CLIENTID_DCE = 0x04,
+- SOC15_IH_CLIENTID_ISP = 0x05,
+- SOC15_IH_CLIENTID_PCIE0 = 0x06,
+- SOC15_IH_CLIENTID_RLC = 0x07,
+- SOC15_IH_CLIENTID_SDMA0 = 0x08,
+- SOC15_IH_CLIENTID_SDMA1 = 0x09,
+- SOC15_IH_CLIENTID_SE0SH = 0x0a,
+- SOC15_IH_CLIENTID_SE1SH = 0x0b,
+- SOC15_IH_CLIENTID_SE2SH = 0x0c,
+- SOC15_IH_CLIENTID_SE3SH = 0x0d,
+- SOC15_IH_CLIENTID_SYSHUB = 0x0e,
+- SOC15_IH_CLIENTID_THM = 0x0f,
+- SOC15_IH_CLIENTID_UVD = 0x10,
+- SOC15_IH_CLIENTID_VCE0 = 0x11,
+- SOC15_IH_CLIENTID_VMC = 0x12,
+- SOC15_IH_CLIENTID_XDMA = 0x13,
+- SOC15_IH_CLIENTID_GRBM_CP = 0x14,
+- SOC15_IH_CLIENTID_ATS = 0x15,
+- SOC15_IH_CLIENTID_ROM_SMUIO = 0x16,
+- SOC15_IH_CLIENTID_DF = 0x17,
+- SOC15_IH_CLIENTID_VCE1 = 0x18,
+- SOC15_IH_CLIENTID_PWR = 0x19,
+- SOC15_IH_CLIENTID_UTCL2 = 0x1b,
+- SOC15_IH_CLIENTID_EA = 0x1c,
+- SOC15_IH_CLIENTID_UTCL2LOG = 0x1d,
+- SOC15_IH_CLIENTID_MP0 = 0x1e,
+- SOC15_IH_CLIENTID_MP1 = 0x1f,
+-
+- SOC15_IH_CLIENTID_MAX
+-};
+-
+-
+-#define SOC15_INTSRC_CP_END_OF_PIPE 181
+-#define SOC15_INTSRC_CP_BAD_OPCODE 183
+-#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239
+-#define SOC15_INTSRC_VMC_FAULT 0
+-#define SOC15_INTSRC_SDMA_TRAP 224
+-
+-
+-#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
+-#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
+-#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff)
+-#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf)
+-#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1)
+-#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff)
+-#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4]))
+-#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5]))
+-#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6]))
+-#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7]))
+-
+-#endif
+-
+--
+2.7.4
+