Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3437-drm-amdkfd-Copy-in-KFD-related-files.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3437-drm-amdkfd-Copy-in-KFD-related-files.patch | 14204 |
1 file changed, 14204 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3437-drm-amdkfd-Copy-in-KFD-related-files.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3437-drm-amdkfd-Copy-in-KFD-related-files.patch
new file mode 100644
index 00000000..4a29c360
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3437-drm-amdkfd-Copy-in-KFD-related-files.patch
@@ -0,0 +1,14204 @@
+From 633327b772ce8145b588cda81e6e3c39ec118f85 Mon Sep 17 00:00:00 2001
+From: Kent Russell <kent.russell@amd.com>
+Date: Mon, 15 Jan 2018 06:51:28 -0500
+Subject: [PATCH 3437/4131] drm/amdkfd: Copy in KFD-related files
+
+This includes kernel configs, README.md, includes, and amdgpu/*amdkfd*
+files
+
+Change-Id: Ie15c0428e9e1de3e93750679d9dec4b7033c9cb2
+Signed-off-by: Kent Russell <kent.russell@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/Kconfig | 4 +-
+ drivers/gpu/drm/amd/amdkfd/Makefile | 24 +-
+ drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 72 +-
+ drivers/gpu/drm/amd/amdkfd/cik_int.h | 27 +-
+ drivers/gpu/drm/amd/amdkfd/cik_regs.h | 3 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 10 +-
+ .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 1411 ++++++++++++++++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1118 +++++++++++++++-
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 166 ++-
+ drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 50 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c | 109 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h | 32 +
+ drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c | 3 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 2 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 529 ++++++--
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 517 ++++++-
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 37 +-
+ .../drm/amd/amdkfd/kfd_device_queue_manager_cik.c | 56 +
+ .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 84 ++
+ .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 98 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 68 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 150 ++-
+ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 122 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 135 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c | 6 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_iommu.c | 356 +++++
+ drivers/gpu/drm/amd/amdkfd/kfd_iommu.h | 78 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 270 ++++
+ drivers/gpu/drm/amd/amdkfd/kfd_ipc.h | 51 +
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 71 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 17 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 128 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 377 ++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 361 +++++
+ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 43 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 52 +
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 11 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 126 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 523 ++++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 133 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 377 ++----
+ drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 515 +++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h | 583 ++++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h | 97 --
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 340 ++++-
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 783 +++++++++--
+ .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 75 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 296 ++++
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 185 ++-
+ drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 21 +-
+ drivers/gpu/drm/amd/amdkfd/soc15_int.h | 84 ++
+ 51 files changed, 9631 insertions(+), 1155 deletions(-)
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_iommu.h
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_ipc.h
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_rdma.c
+ create mode 100644 drivers/gpu/drm/amd/amdkfd/soc15_int.h
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
+index bc5a294..50b8b56 100644
+--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
++++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
+@@ -4,6 +4,8 @@
+
+ config HSA_AMD
+ 	tristate "HSA kernel driver for AMD GPU devices"
+-	depends on DRM_AMDGPU && AMD_IOMMU_V2 && X86_64
++	depends on (DRM_RADEON || DRM_AMDGPU) && (X86_64 || PPC64 || ARM64)
++	select DRM_AMDGPU_USERPTR
++	imply AMD_IOMMU_V2
+ 	help
+ 	  Enable this if you want to use HSA features on AMD GPU devices.
+diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
+index a317e76..b65537a 100644
+--- a/drivers/gpu/drm/amd/amdkfd/Makefile
++++ b/drivers/gpu/drm/amd/amdkfd/Makefile
+@@ -23,19 +23,29 @@
+ # Makefile for Heterogenous System Architecture support for AMD GPU devices
+ #
+
+-ccflags-y := -Idrivers/gpu/drm/amd/include/  \
+-		-Idrivers/gpu/drm/amd/include/asic_reg
++FULL_AMD_PATH=$(src)/..
++ ++ccflags-y := -Iinclude/drm \ ++ -I$(FULL_AMD_PATH)/include/ \ ++ -I$(FULL_AMD_PATH)/include/asic_reg + + amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ + kfd_pasid.o kfd_doorbell.o kfd_flat_memory.o \ + kfd_process.o kfd_queue.o kfd_mqd_manager.o \ + kfd_mqd_manager_cik.o kfd_mqd_manager_vi.o \ ++ kfd_mqd_manager_v9.o \ + kfd_kernel_queue.o kfd_kernel_queue_cik.o \ +- kfd_kernel_queue_vi.o kfd_packet_manager.o \ +- kfd_process_queue_manager.o kfd_device_queue_manager.o \ +- kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ +- kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ +- kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o ++ kfd_kernel_queue_vi.o kfd_kernel_queue_v9.o \ ++ kfd_packet_manager.o kfd_process_queue_manager.o \ ++ kfd_device_queue_manager.o kfd_device_queue_manager_cik.o \ ++ kfd_device_queue_manager_vi.o kfd_device_queue_manager_v9.o \ ++ kfd_interrupt.o kfd_events.o cik_event_interrupt.o kfd_int_process_v9.o \ ++ kfd_dbgdev.o kfd_dbgmgr.o kfd_flat_memory.o kfd_crat.o kfd_rdma.o \ ++ kfd_peerdirect.o kfd_ipc.o ++ ++ifneq ($(CONFIG_AMD_IOMMU_V2),) ++amdkfd-y += kfd_iommu.o ++endif + + amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o + +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +index 3d5ccb3..751c004 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +@@ -24,44 +24,90 @@ + #include "kfd_events.h" + #include "cik_int.h" + +-static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++static bool is_cpc_vm_fault(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; ++ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && ++ ihre->vmid >= dev->vm_info.first_vmid_kfd && ++ ihre->vmid <= dev->vm_info.last_vmid_kfd) ++ return true; ++ return false; ++} ++ ++static bool cik_event_interrupt_isr(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry, ++ uint32_t *patched_ihre, ++ bool *patched_flag) ++{ ++ const struct cik_ih_ring_entry *ihre = ++ (const struct cik_ih_ring_entry *)ih_ring_entry; ++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; ++ struct cik_ih_ring_entry *tmp_ihre = ++ (struct cik_ih_ring_entry *) patched_ihre; ++ ++ /* This workaround is due to HW/FW limitation on Hawaii that ++ * VMID and PASID are not written into ih_ring_entry ++ */ ++ if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && ++ dev->device_info->asic_family == CHIP_HAWAII) { ++ *patched_flag = true; ++ *tmp_ihre = *ihre; + ++ tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); ++ tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( ++ dev->kgd, tmp_ihre->vmid); ++ return (tmp_ihre->pasid != 0) && ++ tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && ++ tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; ++ } + /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ +- return (pasid != 0) && ++ return (ihre->pasid != 0) && + (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + ihre->source_id == CIK_INTSRC_SDMA_TRAP || + ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || +- ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE); ++ ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || ++ is_cpc_vm_fault(dev, ih_ring_entry)); + } + + static void cik_event_interrupt_wq(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) + { +- unsigned int pasid; + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + uint32_t context_id = ihre->data & 0xfffffff; + +- pasid = (ihre->ring_id & 0xffff0000) >> 16; +- +- if (pasid == 0) ++ if (ihre->pasid == 0) + return; + + if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) +- kfd_signal_event_interrupt(pasid, context_id, 28); ++ kfd_signal_event_interrupt(ihre->pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) +- kfd_signal_event_interrupt(pasid, context_id, 28); ++ kfd_signal_event_interrupt(ihre->pasid, context_id, 28); + else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) +- kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); ++ kfd_signal_event_interrupt(ihre->pasid, context_id & 0xff, 8); + else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) +- kfd_signal_hw_exception_event(pasid); ++ kfd_signal_hw_exception_event(ihre->pasid); ++ else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ++ ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { ++ struct kfd_vm_fault_info info; ++ ++ kfd_process_vm_fault(dev->dqm, ihre->pasid); ++ ++ memset(&info, 0, sizeof(info)); ++ dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); ++ if (!info.page_addr && !info.status) ++ return; ++ ++ if (info.vmid == ihre->vmid) ++ kfd_signal_vm_fault_event(dev, ihre->pasid, &info); ++ else ++ kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); ++ } + } + + const struct kfd_event_interrupt_class event_interrupt_class_cik = { +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h +index 109298b..ff8255d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_int.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h +@@ -26,17 +26,32 @@ + #include <linux/types.h> + + struct cik_ih_ring_entry { +- uint32_t source_id; +- uint32_t data; +- uint32_t ring_id; +- uint32_t reserved; ++ uint32_t source_id:8; ++ uint32_t reserved1:8; ++ uint32_t reserved2:16; ++ ++ uint32_t data:28; ++ uint32_t reserved3:4; ++ ++ /* pipeid, meid and unused3 are officially called RINGID, ++ * but for our purposes, they always decode into pipe and ME. 
++ */ ++ uint32_t pipeid:2; ++ uint32_t meid:2; ++ uint32_t reserved4:4; ++ uint32_t vmid:8; ++ uint32_t pasid:16; ++ ++ uint32_t reserved5; + }; + ++#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 + #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 + #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 +-#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 +-#define CIK_INTSRC_SDMA_TRAP 0xE0 + #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF ++#define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 ++#define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 ++#define CIK_INTSRC_SDMA_TRAP 0xE0 + + #endif + +diff --git a/drivers/gpu/drm/amd/amdkfd/cik_regs.h b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +index 48769d1..37ce6dd 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cik_regs.h ++++ b/drivers/gpu/drm/amd/amdkfd/cik_regs.h +@@ -33,7 +33,8 @@ + #define APE1_MTYPE(x) ((x) << 7) + + /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ +-#define MTYPE_CACHED 0 ++#define MTYPE_CACHED_NV 0 ++#define MTYPE_CACHED 1 + #define MTYPE_NONCACHED 3 + + #define DEFAULT_CP_HQD_PERSISTENT_STATE (0x33U << 8) +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +index 997a383d..751cc2e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +@@ -74,7 +74,7 @@ var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_D + /*************************************************************************/ + /* control on how to run the shader */ + /*************************************************************************/ +-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) ++//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) + var EMU_RUN_HACK = 0 + var EMU_RUN_HACK_RESTORE_NORMAL = 0 + var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +@@ -88,9 +88,9 @@ var WG_BASE_ADDR_HI = 0x0 + var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem + var CTX_SAVE_CONTROL = 0x0 + var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +-var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) ++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) + var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +-var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes ++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes + var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + + /**************************************************************************/ +@@ -149,7 +149,7 @@ var s_save_spi_init_lo = exec_lo + var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +-var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} ++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} + var s_save_pc_hi = ttmp1 + var s_save_exec_lo = ttmp2 + var s_save_exec_hi = ttmp3 +@@ -1048,7 +1048,7 @@ end + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by 
s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + +- s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time ++ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time + + if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d +diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +new file mode 100644 +index 0000000..bd2957c +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +@@ -0,0 +1,1411 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#if 0 ++HW (GFX9) source code for CWSR trap handler ++#Version 18 + multiple trap handler ++ ++// this performance-optimal version was originally from Seven Xu at SRDC ++ ++// Revison #18 --... ++/* Rev History ++** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) ++** #4. SR Memory Layout: ++** 1. VGPR-SGPR-HWREG-{LDS} ++** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. ++** #5. Update: 1. Accurate g8sr_ts_save_d timestamp ++** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) ++** #7. Update: 1. don't barrier if noLDS ++** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version ++** 2. Fix SQ issue by s_sleep 2 ++** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last ++** 2. optimize s_buffer save by burst 16sgprs... ++** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. ++** #11. Update 1. Add 2 more timestamp for debug version ++** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance ++** #13. Integ 1. Always use MUBUF for PV trap shader... ++** #14. Update 1. s_buffer_store soft clause... ++** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. ++** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree ++** #17. Update 1. 
FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] ++** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... ++** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 ++** 2. FUNC - Handle non-CWSR traps ++*/ ++ ++var G8SR_WDMEM_HWREG_OFFSET = 0 ++var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes ++ ++// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. ++ ++var G8SR_DEBUG_TIMESTAMP = 0 ++var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset ++var s_g8sr_ts_save_s = s[34:35] // save start ++var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi ++var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ ++var s_g8sr_ts_save_d = s[40:41] // save end ++var s_g8sr_ts_restore_s = s[42:43] // restore start ++var s_g8sr_ts_restore_d = s[44:45] // restore end ++ ++var G8SR_VGPR_SR_IN_DWX4 = 0 ++var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes ++var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 ++ ++ ++/*************************************************************************/ ++/* control on how to run the shader */ ++/*************************************************************************/ ++//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) ++var EMU_RUN_HACK = 0 ++var EMU_RUN_HACK_RESTORE_NORMAL = 0 ++var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 ++var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 ++var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK ++var SAVE_LDS = 1 ++var WG_BASE_ADDR_LO = 0x9000a000 ++var WG_BASE_ADDR_HI = 0x0 ++var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem ++var CTX_SAVE_CONTROL = 0x0 ++var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL ++var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) ++var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write ++var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes ++var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing ++var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency ++ ++/**************************************************************************/ ++/* variables */ ++/**************************************************************************/ ++var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 ++var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 ++var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 ++var SQ_WAVE_STATUS_HALT_MASK = 0x2000 ++ ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 ++var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 ++var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 ++var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits ++ ++var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 ++var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask ++var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 ++var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 ++var 
SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 ++var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 ++var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 ++var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 ++ ++var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME ++var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME ++var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME ++ ++var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 ++var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 ++ ++ ++/* Save */ ++var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes ++var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE ++ ++var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_SAVE_SPI_INIT_ATC_SHIFT = 27 ++var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used ++var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME ++var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME ++ ++var s_save_spi_init_lo = exec_lo ++var s_save_spi_init_hi = exec_hi ++ ++var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} ++var s_save_pc_hi = ttmp1 ++var s_save_exec_lo = ttmp2 ++var s_save_exec_hi = ttmp3 ++var s_save_status = ttmp4 ++var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine ++var s_save_xnack_mask_lo = ttmp6 ++var s_save_xnack_mask_hi = ttmp7 ++var s_save_buf_rsrc0 = ttmp8 ++var s_save_buf_rsrc1 = ttmp9 ++var s_save_buf_rsrc2 = ttmp10 ++var s_save_buf_rsrc3 = ttmp11 ++ ++var s_save_mem_offset = ttmp14 ++var s_save_alloc_size = s_save_trapsts //conflict ++var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) ++var s_save_m0 = ttmp15 ++ ++/* Restore */ ++var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE ++var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC ++ ++var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit ++var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 ++var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype ++var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 ++var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG ++var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 ++ ++var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT ++var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK ++var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ ++var s_restore_spi_init_lo = exec_lo ++var s_restore_spi_init_hi = exec_hi ++ ++var s_restore_mem_offset = ttmp12 ++var s_restore_alloc_size = ttmp3 ++var s_restore_tmp = ttmp6 ++var s_restore_mem_offset_save = s_restore_tmp //no conflict ++ ++var s_restore_m0 = s_restore_alloc_size //no conflict ++ ++var s_restore_mode = ttmp7 ++ ++var s_restore_pc_lo = ttmp0 ++var s_restore_pc_hi = ttmp1 ++var s_restore_exec_lo = ttmp14 ++var s_restore_exec_hi = ttmp15 ++var 
s_restore_status = ttmp4 ++var s_restore_trapsts = ttmp5 ++var s_restore_xnack_mask_lo = xnack_mask_lo ++var s_restore_xnack_mask_hi = xnack_mask_hi ++var s_restore_buf_rsrc0 = ttmp8 ++var s_restore_buf_rsrc1 = ttmp9 ++var s_restore_buf_rsrc2 = ttmp10 ++var s_restore_buf_rsrc3 = ttmp11 ++ ++/**************************************************************************/ ++/* trap handler entry points */ ++/**************************************************************************/ ++/* Shader Main*/ ++ ++shader main ++ asic(GFX9) ++ type(CS) ++ ++ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore ++ //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC ++ s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. ++ s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE ++ //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE ++ s_branch L_SKIP_RESTORE //NOT restore, SAVE actually ++ else ++ s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save ++ end ++ ++L_JUMP_TO_RESTORE: ++ s_branch L_RESTORE //restore ++ ++L_SKIP_RESTORE: ++ ++ s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC ++ s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save ++ s_cbranch_scc1 L_SAVE //this is the operation for save ++ ++ // ********* Handle non-CWSR traps ******************* ++if (!EMU_RUN_HACK) ++ // Illegal instruction is a non-maskable exception which blocks context save. ++ // Halt the wavefront and return from the trap. ++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK ++ s_cbranch_scc1 L_HALT_WAVE ++ ++ // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. ++ // Instead, halt the wavefront and return from the trap. ++ s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK ++ s_cbranch_scc0 L_NO_MEM_VIOL ++ ++L_HALT_WAVE: ++ // If STATUS.HALT is set then this fault must come from SQC instruction fetch. ++ // We cannot prevent further faults so just terminate the wavefront. ++ s_and_b32 ttmp8, s_save_status, SQ_WAVE_STATUS_HALT_MASK ++ s_cbranch_scc0 L_NOT_ALREADY_HALTED ++ s_endpgm ++L_NOT_ALREADY_HALTED: ++ s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK ++ ++ // If the PC points to S_ENDPGM then context save will fail if STATUS.HALT is set. ++ // Rewind the PC to prevent this from occurring. The debugger compensates for this. 
++ s_sub_u32 ttmp0, ttmp0, 0x8 ++ s_subb_u32 ttmp1, ttmp1, 0x0 ++ ++ s_branch L_EXCP_CASE ++ ++L_NO_MEM_VIOL: ++ /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ ++ s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO) ++ s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI) ++ s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 ++ s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0 ++ s_waitcnt lgkmcnt(0) ++ s_or_b32 ttmp7, ttmp8, ttmp9 ++ s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler ++ ++L_NO_NEXT_TRAP: ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception ++ s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. ++ s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 ++ s_addc_u32 ttmp1, ttmp1, 0 ++L_EXCP_CASE: ++ s_and_b32 ttmp1, ttmp1, 0xFFFF ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) ++ s_rfe_b64 [ttmp0, ttmp1] ++end ++ // ********* End handling of non-CWSR traps ******************* ++ ++/**************************************************************************/ ++/* save routine */ ++/**************************************************************************/ ++ ++L_SAVE: ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++end ++ ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ ++ s_mov_b32 s_save_tmp, 0 //clear saveCtx bit ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit ++ ++ s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK ++ s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY ++ s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp ++ s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS ++ s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG ++ ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp ++ ++ /* inform SPI the readiness and wait for SPI's go signal */ ++ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI ++ s_mov_b32 s_save_exec_hi, exec_hi ++ s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_sq_save_msg ++ s_waitcnt lgkmcnt(0) ++end ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC ++ end ++ ++ L_SLEEP: ++ s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 ++ ++ if (EMU_RUN_HACK) ++ ++ else ++ s_cbranch_execz L_SLEEP ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_spi_wrexec ++ 
s_waitcnt lgkmcnt(0) ++end ++ ++ /* setup Resource Contants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_save_tmp, v9, 0 ++ s_lshr_b32 s_save_tmp, s_save_tmp, 6 ++ s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) ++ s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL ++ else ++ end ++ ++ ++ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo ++ s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited ++ s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE ++ ++ //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) ++ s_mov_b32 s_save_m0, m0 //save M0 ++ ++ /* global mem offset */ ++ s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ ++ ++ ++ /* save HW registers */ ++ ////////////////////////////// ++ ++ L_SAVE_HWREG: ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ ++ ++ s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 ++ ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ end ++ ++ write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC ++ write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC ++ write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) ++ write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS ++ ++ //s_save_trapsts conflicts with s_save_alloc_size ++ s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) ++ write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS ++ ++ write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO ++ write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI ++ ++ //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 ++ s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE ++ write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) ++ ++ ++ ++ /* the first wave in the threadgroup */ ++ s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit ++ s_mov_b32 s_save_exec_hi, 0x0 ++ s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] ++ ++ ++ /* save SGPRs */ ++ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 ++ //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 ++ s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 ++ s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset ++ s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 ++ ++ s_mov_b32 m0, 0x0 //SGPR initial index value =0 ++ s_nop 0x0 //Manually inserted wait states ++ L_SAVE_SGPR_LOOP: ++ // SGPR is allocated in 16 SGPR granularity ++ s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] ++ s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] ++ s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] ++ s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] ++ s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] ++ s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] ++ s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] ++ s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] ++ ++ write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 ++ s_add_u32 m0, m0, 16 //next sgpr index ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? ++ // restore s_save_buf_rsrc0,1 ++ //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo ++ s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo ++ ++ ++ ++ ++ /* save first 4 VGPR, then LDS save could use */ ++ // each wave will alloc 4 vgprs at least... ++ ///////////////////////////////////////////////////////////////////////////////////// ++ ++ s_mov_b32 s_save_mem_offset, 0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++end ++ ++ ++ ++ /* save LDS */ ++ ////////////////////////////// ++ ++ L_SAVE_LDS: ++ ++ // Change EXEC to all threads... ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE ++ ++ s_barrier //LDS is used? 
wait for other waves in the same TG ++ s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here ++ s_cbranch_scc0 L_SAVE_LDS_DONE ++ ++ // first wave do LDS save; ++ ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_save_mem_offset) ++ get_sgpr_size_bytes(s_save_tmp) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ ++var LDS_DMA_ENABLE = 0 ++var UNROLL = 0 ++if UNROLL==0 && LDS_DMA_ENABLE==1 ++ s_mov_b32 s3, 256*2 ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 ++ L_SAVE_LDS_LOOP: ++ //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? ++ if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ end ++ ++ s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? ++ ++elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss ++ // store from higest LDS address to lowest ++ s_mov_b32 s3, 256*2 ++ s_sub_u32 m0, s_save_alloc_size, s3 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 ++ s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... ++ s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest ++ s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc ++ s_nop 0 ++ s_nop 0 ++ s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes ++ s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved ++ s_add_u32 s0, s0,s_save_alloc_size ++ s_addc_u32 s1, s1, 0 ++ s_setpc_b64 s[0:1] ++ ++ ++ for var i =0; i< 128; i++ ++ // be careful to make here a 64Byte aligned address, which could improve performance... ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW ++ buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW ++ ++ if i!=127 ++ s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. 
pack more LDS_DMA inst to one Cacheline ++ s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 ++ end ++ end ++ ++else // BUFFER_STORE ++ v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 ++ v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid ++ v_mul_i32_i24 v2, v3, 8 // tid*8 ++ v_mov_b32 v3, 256*2 ++ s_mov_b32 m0, 0x10000 ++ s_mov_b32 s0, s_save_buf_rsrc3 ++ s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid ++ s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT ++ ++L_SAVE_LDS_LOOP_VECTOR: ++ ds_read_b64 v[0:1], v2 //x =LDS[a], byte address ++ s_waitcnt lgkmcnt(0) ++ buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 ++// s_waitcnt vmcnt(0) ++// v_add_u32 v2, vcc[0:1], v2, v3 ++ v_add_u32 v2, v2, v3 ++ v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size ++ s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR ++ ++ // restore rsrc3 ++ s_mov_b32 s_save_buf_rsrc3, s0 ++ ++end ++ ++L_SAVE_LDS_DONE: ++ ++ ++ /* save VGPRs - set the Rest VGPRs */ ++ ////////////////////////////////////////////////////////////////////////////////////// ++ L_SAVE_VGPR: ++ // VGPR SR memory offset: 0 ++ // TODO rearrange the RSRC words to use swizzle for VGPR save... ++ ++ s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 ++ s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible ++ s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ ++ // VGPR Allocated in 4-GPR granularity ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, 4 // skip first 4 VGPRs ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs ++ ++ s_set_gpr_idx_on m0, 0x1 // This will change M0 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 ++L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 // v0 = v[0+m0] ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ ++ ++ buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ s_add_u32 m0, m0, 4 ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? 
++ s_set_gpr_idx_off ++L_SAVE_VGPR_LOOP_END: ++ ++ s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes ++else ++ // VGPR store using dw burst ++ s_mov_b32 m0, 0x4 //VGPR initial index value =0 ++ s_cmp_lt_u32 m0, s_save_alloc_size ++ s_cbranch_scc0 L_SAVE_VGPR_END ++ ++ ++ s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 ++ s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later ++ ++ L_SAVE_VGPR_LOOP: ++ v_mov_b32 v0, v0 //v0 = v[0+m0] ++ v_mov_b32 v1, v1 //v0 = v[0+m0] ++ v_mov_b32 v2, v2 //v0 = v[0+m0] ++ v_mov_b32 v3, v3 //v0 = v[0+m0] ++ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 ++ buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 ++ buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? ++ s_set_gpr_idx_off ++end ++ ++L_SAVE_VGPR_END: ++ ++ ++ ++ ++ ++ ++ /* S_PGM_END_SAVED */ //FIXME graphics ONLY ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) ++ s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] ++ s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 ++ s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over ++ s_rfe_b64 s_save_pc_lo //Return to the main shader program ++ else ++ end ++ ++// Save Done timestamp ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_save_d ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_save_mem_offset) ++ s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // Need reset rsrc2?? ++ s_mov_b32 m0, s_save_mem_offset ++ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 ++end ++ ++ ++ s_branch L_END_PGM ++ ++ ++ ++/**************************************************************************/ ++/* restore routine */ ++/**************************************************************************/ ++ ++L_RESTORE: ++ /* Setup Resource Contants */ ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ //calculate wd_addr using absolute thread id ++ v_readlane_b32 s_restore_tmp, v9, 0 ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 ++ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE ++ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO ++ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI ++ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL ++ else ++ end ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_s ++ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? ++ // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... 
++ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] ++ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. ++end ++ ++ ++ ++ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo ++ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE ++ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) ++ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK ++ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position ++ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE ++ ++ /* global mem offset */ ++// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 ++ ++ /* the first wave in the threadgroup */ ++ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK ++ s_cbranch_scc0 L_RESTORE_VGPR ++ ++ /* restore LDS */ ++ ////////////////////////////// ++ L_RESTORE_LDS: ++ ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size ++ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? ++ s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes ++ s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes ++ ++ // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) ++ // ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? ++ ++ ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ s_mov_b32 m0, 0x0 //lds_offset initial value = 0 ++ ++ L_RESTORE_LDS_LOOP: ++ if (SAVE_LDS) ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW ++ end ++ s_add_u32 m0, m0, 256*2 // 128 DW ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? 
++ ++ ++ /* restore VGPRs */ ++ ////////////////////////////// ++ L_RESTORE_VGPR: ++ // VGPR SR memory offset : 0 ++ s_mov_b32 s_restore_mem_offset, 0x0 ++ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead ++ s_mov_b32 exec_hi, 0xFFFFFFFF ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++if G8SR_VGPR_SR_IN_DWX4 ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ ++ // the const stride for DWx4 is 4*4 bytes ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes ++ ++ s_mov_b32 m0, s_restore_alloc_size ++ s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 ++ ++L_RESTORE_VGPR_LOOP: ++ buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ s_waitcnt vmcnt(0) ++ s_sub_u32 m0, m0, 4 ++ v_mov_b32 v0, v0 // v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_cmp_eq_u32 m0, 0x8000 ++ s_cbranch_scc0 L_RESTORE_VGPR_LOOP ++ s_set_gpr_idx_off ++ ++ s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 ++ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes ++ ++else ++ // VGPR load using dw burst ++ s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 ++ s_mov_b32 m0, 4 //VGPR initial index value = 1 ++ s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later ++ ++ L_RESTORE_VGPR_LOOP: ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 ++ end ++ s_waitcnt vmcnt(0) //ensure data ready ++ v_mov_b32 v0, v0 //v[0+m0] = v0 ++ v_mov_b32 v1, v1 ++ v_mov_b32 v2, v2 ++ v_mov_b32 v3, v3 ++ s_add_u32 m0, m0, 4 //next vgpr index ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes ++ s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
++ s_set_gpr_idx_off ++ /* VGPR restore on v0 */ ++ if(USE_MTBUF_INSTEAD_OF_MUBUF) ++ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 ++ else ++ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 ++ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 ++ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 ++ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 ++ end ++ ++end ++ ++ /* restore SGPRs */ ++ ////////////////////////////// ++ ++ // SGPR SR memory offset : size(VGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPR from S[n] to S[0], by 16 sgprs group ++ // TODO, change RSRC word to rearrange memory layout for SGPRS ++ ++ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 ++ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) ++ ++ if (SGPR_SAVE_USE_SQC) ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes ++ else ++ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) ++ end ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? ++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ s_mov_b32 m0, s_restore_alloc_size ++ ++ L_RESTORE_SGPR_LOOP: ++ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made ++ s_waitcnt lgkmcnt(0) //ensure data ready ++ ++ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] ++ s_nop 0 // hazard SALU M0=> S_MOVREL ++ ++ s_movreld_b64 s0, s0 //s[0+m0] = s0 ++ s_movreld_b64 s2, s2 ++ s_movreld_b64 s4, s4 ++ s_movreld_b64 s6, s6 ++ s_movreld_b64 s8, s8 ++ s_movreld_b64 s10, s10 ++ s_movreld_b64 s12, s12 ++ s_movreld_b64 s14, s14 ++ ++ s_cmp_eq_u32 m0, 0 //scc = (m0 < s_restore_alloc_size) ? 1 : 0 ++ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? ++ ++ /* restore HW registers */ ++ ////////////////////////////// ++ L_RESTORE_HWREG: ++ ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo ++ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi ++end ++ ++ // HWREG SR memory offset : size(VGPR)+size(SGPR) ++ get_vgpr_size_bytes(s_restore_mem_offset) ++ get_sgpr_size_bytes(s_restore_tmp) ++ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp ++ ++ ++ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes ++ if (SWIZZLE_EN) ++ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
++ else ++ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes ++ end ++ ++ read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 ++ read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC ++ read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC ++ read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) ++ read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS ++ read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS ++ read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO ++ read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI ++ read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE ++ ++ s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS ++ ++ s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS ++ ++ //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: ++ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) ++ s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal ++ s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over ++ end ++ ++ s_mov_b32 m0, s_restore_m0 ++ s_mov_b32 exec_lo, s_restore_exec_lo ++ s_mov_b32 exec_hi, s_restore_exec_hi ++ ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 ++ s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT ++ s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 ++ //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore ++ s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode ++ //reuse s_restore_m0 as a temp register ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT ++ s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT ++ s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT ++ s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 ++ s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK ++ s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT ++ s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp ++ ++ s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 ++ s_and_b64 vcc, 
vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 ++ s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu ++ ++ s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time ++ ++if G8SR_DEBUG_TIMESTAMP ++ s_memrealtime s_g8sr_ts_restore_d ++ s_waitcnt lgkmcnt(0) ++end ++ ++// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution ++ s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc ++ ++ ++/**************************************************************************/ ++/* the END */ ++/**************************************************************************/ ++L_END_PGM: ++ s_endpgm ++ ++end ++ ++ ++/**************************************************************************/ ++/* the helper functions */ ++/**************************************************************************/ ++ ++//Only for save hwreg to mem ++function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) ++ s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on ++ s_mov_b32 m0, s_mem_offset ++ s_buffer_store_dword s, s_rsrc, m0 glc:1 ++if ACK_SQC_STORE ++ s_waitcnt lgkmcnt(0) ++end ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++ s_mov_b32 m0, exec_lo ++end ++ ++ ++// HWREG are saved before SGPRs, so all HWREG could be use. ++function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) ++ ++ s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 ++if ACK_SQC_STORE ++ s_waitcnt lgkmcnt(0) ++end ++ s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 ++if ACK_SQC_STORE ++ s_waitcnt lgkmcnt(0) ++end ++ s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 ++if ACK_SQC_STORE ++ s_waitcnt lgkmcnt(0) ++end ++ s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 ++if ACK_SQC_STORE ++ s_waitcnt lgkmcnt(0) ++end ++ s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 ++ s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc ++end ++ ++ ++function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 ++ s_add_u32 s_mem_offset, s_mem_offset, 4 ++end ++ ++function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) ++ s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 ++ s_sub_u32 s_mem_offset, s_mem_offset, 4*16 ++end ++ ++ ++ ++function get_lds_size_bytes(s_lds_size_byte) ++ // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW ++ s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size ++ s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW ++end ++ ++function get_vgpr_size_bytes(s_vgpr_size_byte) ++ s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size ++ s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 ++ s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible ++end ++ ++function get_sgpr_size_bytes(s_sgpr_size_byte) ++ s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size ++ s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 ++ s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) ++end ++ ++function get_hwreg_size_bytes ++ return 128 //HWREG 
size 128 bytes ++end ++ ++ ++ ++#endif ++ ++static const uint32_t cwsr_trap_gfx9_hex[] = { ++ 0xbf820001, 0xbf820130, ++ 0xb8f0f802, 0x89708670, ++ 0xb8f1f803, 0x8674ff71, ++ 0x00000400, 0xbf850023, ++ 0x8674ff71, 0x00000800, ++ 0xbf850003, 0x8674ff71, ++ 0x00000100, 0xbf840009, ++ 0x8674ff70, 0x00002000, ++ 0xbf840001, 0xbf810000, ++ 0x8770ff70, 0x00002000, ++ 0x80ec886c, 0x82ed806d, ++ 0xbf820010, 0xb8faf812, ++ 0xb8fbf813, 0x8efa887a, ++ 0xc00a1d3d, 0x00000000, ++ 0xbf8cc07f, 0x87737574, ++ 0xbf840002, 0xb970f802, ++ 0xbe801d74, 0xb8f1f803, ++ 0x8671ff71, 0x000001ff, ++ 0xbf850002, 0x806c846c, ++ 0x826d806d, 0x866dff6d, ++ 0x0000ffff, 0xb970f802, ++ 0xbe801f6c, 0x866dff6d, ++ 0x0000ffff, 0xbef60080, ++ 0xb9760283, 0xbef20068, ++ 0xbef30069, 0xb8f62407, ++ 0x8e769c76, 0x876d766d, ++ 0xb8f603c7, 0x8e769b76, ++ 0x876d766d, 0xb8f6f807, ++ 0x8676ff76, 0x00007fff, ++ 0xb976f807, 0xbeee007e, ++ 0xbeef007f, 0xbefe0180, ++ 0xbf900004, 0xbf8e0002, ++ 0xbf88fffe, 0xbef4007e, ++ 0x8675ff7f, 0x0000ffff, ++ 0x8775ff75, 0x00040000, ++ 0xbef60080, 0xbef700ff, ++ 0x00807fac, 0x8676ff7f, ++ 0x08000000, 0x8f768376, ++ 0x87777677, 0x8676ff7f, ++ 0x70000000, 0x8f768176, ++ 0x87777677, 0xbefb007c, ++ 0xbefa0080, 0xb8fa2a05, ++ 0x807a817a, 0x8e7a8a7a, ++ 0xb8f61605, 0x80768176, ++ 0x8e768676, 0x807a767a, ++ 0xbef60084, 0xbef600ff, ++ 0x01000000, 0xbefe007c, ++ 0xbefc007a, 0xc0611efa, ++ 0x0000007c, 0xbf8cc07f, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611b3a, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611b7a, ++ 0x0000007c, 0xbf8cc07f, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611bba, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611bfa, ++ 0x0000007c, 0xbf8cc07f, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611c3a, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0xb8f1f803, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611c7a, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0xbefe007c, ++ 0xbefc007a, 0xc0611cba, ++ 0x0000007c, 0xbf8cc07f, ++ 0x807a847a, 0xbefc007e, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611cfa, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0xb8fbf801, ++ 0xbefe007c, 0xbefc007a, ++ 0xc0611efa, 0x0000007c, ++ 0xbf8cc07f, 0x807a847a, ++ 0xbefc007e, 0x8676ff7f, ++ 0x04000000, 0xbeef0080, ++ 0x876f6f76, 0xb8fa2a05, ++ 0x807a817a, 0x8e7a8a7a, ++ 0xb8f11605, 0x80718171, ++ 0x8e718471, 0x8e768271, ++ 0xbef600ff, 0x01000000, ++ 0xbef20174, 0x80747a74, ++ 0x82758075, 0xbefc0080, ++ 0xbf800000, 0xbe802b00, ++ 0xbe822b02, 0xbe842b04, ++ 0xbe862b06, 0xbe882b08, ++ 0xbe8a2b0a, 0xbe8c2b0c, ++ 0xbe8e2b0e, 0xc06b003a, ++ 0x00000000, 0xbf8cc07f, ++ 0xc06b013a, 0x00000010, ++ 0xbf8cc07f, 0xc06b023a, ++ 0x00000020, 0xbf8cc07f, ++ 0xc06b033a, 0x00000030, ++ 0xbf8cc07f, 0x8074c074, ++ 0x82758075, 0x807c907c, ++ 0xbf0a717c, 0xbf85ffe7, ++ 0xbef40172, 0xbefa0080, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xbef600ff, 0x01000000, ++ 0xe0724000, 0x7a1d0000, ++ 0xe0724100, 0x7a1d0100, ++ 0xe0724200, 0x7a1d0200, ++ 0xe0724300, 0x7a1d0300, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8f14306, 0x8671c171, ++ 0xbf84002c, 0xbf8a0000, ++ 0x8676ff6f, 0x04000000, ++ 0xbf840028, 0x8e718671, ++ 0x8e718271, 0xbef60071, ++ 0xb8fa2a05, 0x807a817a, ++ 0x8e7a8a7a, 0xb8f61605, ++ 0x80768176, 0x8e768676, ++ 0x807a767a, 0x807aff7a, ++ 0x00000080, 0xbef600ff, ++ 0x01000000, 0xbefc0080, ++ 0xd28c0002, 0x000100c1, ++ 0xd28d0003, 0x000204c1, ++ 0xd1060002, 0x00011103, ++ 0x7e0602ff, 0x00000200, ++ 0xbefc00ff, 0x00010000, ++ 
0xbe800077, 0x8677ff77, ++ 0xff7fffff, 0x8777ff77, ++ 0x00058000, 0xd8ec0000, ++ 0x00000002, 0xbf8cc07f, ++ 0xe0765000, 0x7a1d0002, ++ 0x68040702, 0xd0c9006a, ++ 0x0000e302, 0xbf87fff7, ++ 0xbef70000, 0xbefa00ff, ++ 0x00000400, 0xbefe00c1, ++ 0xbeff00c1, 0xb8f12a05, ++ 0x80718171, 0x8e718271, ++ 0x8e768871, 0xbef600ff, ++ 0x01000000, 0xbefc0084, ++ 0xbf0a717c, 0xbf840015, ++ 0xbf11017c, 0x8071ff71, ++ 0x00001000, 0x7e000300, ++ 0x7e020301, 0x7e040302, ++ 0x7e060303, 0xe0724000, ++ 0x7a1d0000, 0xe0724100, ++ 0x7a1d0100, 0xe0724200, ++ 0x7a1d0200, 0xe0724300, ++ 0x7a1d0300, 0x807c847c, ++ 0x807aff7a, 0x00000400, ++ 0xbf0a717c, 0xbf85ffef, ++ 0xbf9c0000, 0xbf8200c5, ++ 0xbef4007e, 0x8675ff7f, ++ 0x0000ffff, 0x8775ff75, ++ 0x00040000, 0xbef60080, ++ 0xbef700ff, 0x00807fac, ++ 0x8672ff7f, 0x08000000, ++ 0x8f728372, 0x87777277, ++ 0x8672ff7f, 0x70000000, ++ 0x8f728172, 0x87777277, ++ 0x8672ff7f, 0x04000000, ++ 0xbf84001e, 0xbefe00c1, ++ 0xbeff00c1, 0xb8ef4306, ++ 0x866fc16f, 0xbf840019, ++ 0x8e6f866f, 0x8e6f826f, ++ 0xbef6006f, 0xb8f82a05, ++ 0x80788178, 0x8e788a78, ++ 0xb8f21605, 0x80728172, ++ 0x8e728672, 0x80787278, ++ 0x8078ff78, 0x00000080, ++ 0xbef600ff, 0x01000000, ++ 0xbefc0080, 0xe0510000, ++ 0x781d0000, 0xe0510100, ++ 0x781d0000, 0x807cff7c, ++ 0x00000200, 0x8078ff78, ++ 0x00000200, 0xbf0a6f7c, ++ 0xbf85fff6, 0xbef80080, ++ 0xbefe00c1, 0xbeff00c1, ++ 0xb8ef2a05, 0x806f816f, ++ 0x8e6f826f, 0x8e76886f, ++ 0xbef600ff, 0x01000000, ++ 0xbef20078, 0x8078ff78, ++ 0x00000400, 0xbefc0084, ++ 0xbf11087c, 0x806fff6f, ++ 0x00008000, 0xe0524000, ++ 0x781d0000, 0xe0524100, ++ 0x781d0100, 0xe0524200, ++ 0x781d0200, 0xe0524300, ++ 0x781d0300, 0xbf8c0f70, ++ 0x7e000300, 0x7e020301, ++ 0x7e040302, 0x7e060303, ++ 0x807c847c, 0x8078ff78, ++ 0x00000400, 0xbf0a6f7c, ++ 0xbf85ffee, 0xbf9c0000, ++ 0xe0524000, 0x721d0000, ++ 0xe0524100, 0x721d0100, ++ 0xe0524200, 0x721d0200, ++ 0xe0524300, 0x721d0300, ++ 0xb8f82a05, 0x80788178, ++ 0x8e788a78, 0xb8f21605, ++ 0x80728172, 0x8e728672, ++ 0x80787278, 0x80f8c078, ++ 0xb8ef1605, 0x806f816f, ++ 0x8e6f846f, 0x8e76826f, ++ 0xbef600ff, 0x01000000, ++ 0xbefc006f, 0xc031003a, ++ 0x00000078, 0x80f8c078, ++ 0xbf8cc07f, 0x80fc907c, ++ 0xbf800000, 0xbe802d00, ++ 0xbe822d02, 0xbe842d04, ++ 0xbe862d06, 0xbe882d08, ++ 0xbe8a2d0a, 0xbe8c2d0c, ++ 0xbe8e2d0e, 0xbf06807c, ++ 0xbf84fff0, 0xb8f82a05, ++ 0x80788178, 0x8e788a78, ++ 0xb8f21605, 0x80728172, ++ 0x8e728672, 0x80787278, ++ 0xbef60084, 0xbef600ff, ++ 0x01000000, 0xc0211bfa, ++ 0x00000078, 0x80788478, ++ 0xc0211b3a, 0x00000078, ++ 0x80788478, 0xc0211b7a, ++ 0x00000078, 0x80788478, ++ 0xc0211eba, 0x00000078, ++ 0x80788478, 0xc0211efa, ++ 0x00000078, 0x80788478, ++ 0xc0211c3a, 0x00000078, ++ 0x80788478, 0xc0211c7a, ++ 0x00000078, 0x80788478, ++ 0xc0211a3a, 0x00000078, ++ 0x80788478, 0xc0211a7a, ++ 0x00000078, 0x80788478, ++ 0xc0211cfa, 0x00000078, ++ 0x80788478, 0xbf8cc07f, ++ 0x866dff6d, 0x0000ffff, ++ 0xbefc006f, 0xbefe007a, ++ 0xbeff007b, 0x866f71ff, ++ 0x000003ff, 0xb96f4803, ++ 0x866f71ff, 0xfffff800, ++ 0x8f6f8b6f, 0xb96fa2c3, ++ 0xb973f801, 0x866fff6d, ++ 0xf0000000, 0x8f6f9c6f, ++ 0x8e6f906f, 0xbef20080, ++ 0x87726f72, 0x866fff6d, ++ 0x08000000, 0x8f6f9b6f, ++ 0x8e6f8f6f, 0x87726f72, ++ 0x866fff70, 0x00800000, ++ 0x8f6f976f, 0xb972f807, ++ 0x86fe7e7e, 0x86ea6a6a, ++ 0xb970f802, 0xbf8a0000, ++ 0x95806f6c, 0xbf810000, ++}; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 62c3d9c..d5783be 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -25,6 +25,7 @@ + #include <linux/err.h> + #include <linux/fs.h> + #include <linux/sched.h> ++#include <linux/sched/mm.h> + #include <linux/slab.h> + #include <linux/uaccess.h> + #include <linux/compat.h> +@@ -33,13 +34,17 @@ + #include <linux/mm.h> + #include <linux/mman.h> + #include <asm/processor.h> ++#include <linux/ptrace.h> ++ + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_dbgmgr.h" ++#include "kfd_ipc.h" + + static long kfd_ioctl(struct file *, unsigned int, unsigned long); + static int kfd_open(struct inode *, struct file *); + static int kfd_mmap(struct file *, struct vm_area_struct *); ++static bool kfd_dev_is_large_bar(struct kfd_dev *dev); + + static const char kfd_dev_name[] = "kfd"; + +@@ -55,6 +60,14 @@ static int kfd_char_dev_major = -1; + static struct class *kfd_class; + struct device *kfd_device; + ++static char *kfd_devnode(struct device *dev, umode_t *mode) ++{ ++ if (mode && dev->devt == MKDEV(kfd_char_dev_major, 0)) ++ *mode = 0666; ++ ++ return NULL; ++} ++ + int kfd_chardev_init(void) + { + int err = 0; +@@ -69,6 +82,8 @@ int kfd_chardev_init(void) + if (IS_ERR(kfd_class)) + goto err_class_create; + ++ kfd_class->devnode = kfd_devnode; ++ + kfd_device = device_create(kfd_class, NULL, + MKDEV(kfd_char_dev_major, 0), + NULL, kfd_dev_name); +@@ -291,8 +306,16 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, + + + /* Return gpu_id as doorbell offset for mmap usage */ +- args->doorbell_offset = (KFD_MMAP_DOORBELL_MASK | args->gpu_id); ++ args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL; ++ args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id); + args->doorbell_offset <<= PAGE_SHIFT; ++ if (KFD_IS_SOC15(dev->device_info->asic_family)) ++ /* On SOC15 ASICs, doorbell allocation must be ++ * per-device, and independent from the per-process ++ * queue_id. Return the doorbell offset within the ++ * doorbell aperture to user mode. ++ */ ++ args->doorbell_offset |= q_properties.doorbell_off; + + mutex_unlock(&p->mutex); + +@@ -380,6 +403,58 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, + return retval; + } + ++static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p, ++ void *data) ++{ ++ int retval; ++ const int max_num_cus = 1024; ++ struct kfd_ioctl_set_cu_mask_args *args = data; ++ struct queue_properties properties; ++ uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr; ++ size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32); ++ ++ if ((args->num_cu_mask % 32) != 0) { ++ pr_debug("num_cu_mask 0x%x must be a multiple of 32", ++ args->num_cu_mask); ++ return -EINVAL; ++ } ++ ++ properties.cu_mask_count = args->num_cu_mask; ++ if (properties.cu_mask_count == 0) { ++ pr_debug("CU mask cannot be 0"); ++ return -EINVAL; ++ } ++ ++ /* To prevent an unreasonably large CU mask size, set an arbitrary ++ * limit of max_num_cus bits. We can then just drop any CU mask bits ++ * past max_num_cus bits and just use the first max_num_cus bits. 
++ */ ++ if (properties.cu_mask_count > max_num_cus) { ++ pr_debug("CU mask cannot be greater than 1024 bits"); ++ properties.cu_mask_count = max_num_cus; ++ cu_mask_size = sizeof(uint32_t) * (max_num_cus/32); ++ } ++ ++ properties.cu_mask = kzalloc(cu_mask_size, GFP_KERNEL); ++ if (!properties.cu_mask) ++ return -ENOMEM; ++ ++ retval = copy_from_user(properties.cu_mask, cu_mask_ptr, cu_mask_size); ++ if (retval) { ++ pr_debug("Could not copy CU mask from userspace"); ++ kfree(properties.cu_mask); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&p->mutex); ++ ++ retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties); ++ ++ mutex_unlock(&p->mutex); ++ ++ return retval; ++} ++ + static int kfd_ioctl_set_memory_policy(struct file *filep, + struct kfd_process *p, void *data) + { +@@ -441,7 +516,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process_device *pdd; + + dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) ++ if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); +@@ -478,11 +553,6 @@ static int kfd_ioctl_dbg_register(struct file *filep, + if (!dev) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); +- return -EINVAL; +- } +- + mutex_lock(&p->mutex); + mutex_lock(kfd_get_dbgmgr_mutex()); + +@@ -529,11 +599,6 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, + if (!dev || !dev->dbgmgr) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); +- return -EINVAL; +- } +- + mutex_lock(kfd_get_dbgmgr_mutex()); + + status = kfd_dbgmgr_unregister(dev->dbgmgr, p); +@@ -574,11 +639,6 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep, + if (!dev) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } +- + cmd_from_user = (void __user *) args->content_ptr; + + /* Validate arguments */ +@@ -682,11 +742,6 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep, + if (!dev) + return -EINVAL; + +- if (dev->device_info->asic_family == CHIP_CARRIZO) { +- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); +- return -EINVAL; +- } +- + /* input size must match the computed "compact" size */ + if (args->buf_size_in_bytes != computed_buff_size) { + pr_debug("size mismatch, computed : actual %u : %u\n", +@@ -748,12 +803,13 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, + struct timespec64 time; + + dev = kfd_device_by_id(args->gpu_id); +- if (dev == NULL) +- return -EINVAL; +- +- /* Reading GPU clock counter from KGD */ +- args->gpu_clock_counter = +- dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ if (dev) ++ /* Reading GPU clock counter from KGD */ ++ args->gpu_clock_counter = ++ dev->kfd2kgd->get_gpu_clock_counter(dev->kgd); ++ else ++ /* Node without GPU resource */ ++ args->gpu_clock_counter = 0; + + /* No access to rdtsc. 
Using raw monotonic time */ + getrawmonotonic64(&time); +@@ -825,18 +881,151 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, + return 0; + } + ++static int kfd_ioctl_get_process_apertures_new(struct file *filp, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_process_apertures_new_args *args = data; ++ struct kfd_process_device_apertures *pa; ++ struct kfd_process_device *pdd; ++ uint32_t nodes = 0; ++ int ret; ++ ++ dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); ++ ++ if (args->num_of_nodes == 0) { ++ /* Return number of nodes, so that user space can alloacate ++ * sufficient memory ++ */ ++ mutex_lock(&p->mutex); ++ ++ if (!kfd_has_process_device_data(p)) ++ goto out_upwrite; ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ args->num_of_nodes++; ++ pdd = kfd_get_next_process_device_data(p, pdd); ++ } while (pdd); ++ ++ goto out_upwrite; ++ } ++ ++ /* Fill in process-aperture information for all available ++ * nodes, but not more than args->num_of_nodes as that is ++ * the amount of memory allocated by user ++ */ ++ pa = kzalloc((sizeof(struct kfd_process_device_apertures) * ++ args->num_of_nodes), GFP_KERNEL); ++ if (!pa) ++ return -ENOMEM; ++ ++ mutex_lock(&p->mutex); ++ ++ if (!kfd_has_process_device_data(p)) { ++ args->num_of_nodes = 0; ++ kfree(pa); ++ goto out_upwrite; ++ } ++ ++ /* Run over all pdd of the process */ ++ pdd = kfd_get_first_process_device_data(p); ++ do { ++ pa[nodes].gpu_id = pdd->dev->id; ++ pa[nodes].lds_base = pdd->lds_base; ++ pa[nodes].lds_limit = pdd->lds_limit; ++ pa[nodes].gpuvm_base = pdd->gpuvm_base; ++ pa[nodes].gpuvm_limit = pdd->gpuvm_limit; ++ pa[nodes].scratch_base = pdd->scratch_base; ++ pa[nodes].scratch_limit = pdd->scratch_limit; ++ ++ dev_dbg(kfd_device, ++ "gpu id %u\n", pdd->dev->id); ++ dev_dbg(kfd_device, ++ "lds_base %llX\n", pdd->lds_base); ++ dev_dbg(kfd_device, ++ "lds_limit %llX\n", pdd->lds_limit); ++ dev_dbg(kfd_device, ++ "gpuvm_base %llX\n", pdd->gpuvm_base); ++ dev_dbg(kfd_device, ++ "gpuvm_limit %llX\n", pdd->gpuvm_limit); ++ dev_dbg(kfd_device, ++ "scratch_base %llX\n", pdd->scratch_base); ++ dev_dbg(kfd_device, ++ "scratch_limit %llX\n", pdd->scratch_limit); ++ nodes++; ++ ++ pdd = kfd_get_next_process_device_data(p, pdd); ++ } while (pdd && (nodes < args->num_of_nodes)); ++ mutex_unlock(&p->mutex); ++ ++ args->num_of_nodes = nodes; ++ ret = copy_to_user( ++ (void __user *)args->kfd_process_device_apertures_ptr, ++ pa, ++ (nodes * sizeof(struct kfd_process_device_apertures))); ++ kfree(pa); ++ return ret ? 
-EFAULT : 0; ++ ++out_upwrite: ++ mutex_unlock(&p->mutex); ++ return 0; ++} ++ + static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, + void *data) + { + struct kfd_ioctl_create_event_args *args = data; +- int err; ++ struct kfd_dev *kfd; ++ struct kfd_process_device *pdd; ++ int err = -EINVAL; ++ void *mem, *kern_addr = NULL; ++ ++ pr_debug("Event page offset 0x%llx\n", args->event_page_offset); ++ ++ if (args->event_page_offset) { ++ kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); ++ if (!kfd) { ++ pr_err("Getting device by id failed in %s\n", __func__); ++ return -EFAULT; ++ } ++ if (!kfd->device_info->needs_iommu_device) { ++ mutex_lock(&p->mutex); ++ pdd = kfd_bind_process_to_device(kfd, p); ++ if (IS_ERR(pdd)) { ++ err = PTR_ERR(pdd); ++ goto out_upwrite; ++ } ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->event_page_offset)); ++ if (!mem) { ++ pr_err("Can't find BO, offset is 0x%llx\n", ++ args->event_page_offset); ++ err = -EFAULT; ++ goto out_upwrite; ++ } ++ mutex_unlock(&p->mutex); ++ ++ /* Map dGPU gtt BO to kernel */ ++ kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, ++ mem, &kern_addr, NULL); ++ } ++ } ++ ++ err = kfd_event_create(filp, p, ++ args->event_type, ++ args->auto_reset != 0, ++ args->node_id, ++ &args->event_id, ++ &args->event_trigger_data, ++ &args->event_page_offset, ++ &args->event_slot_index, ++ kern_addr); + +- err = kfd_event_create(filp, p, args->event_type, +- args->auto_reset != 0, args->node_id, +- &args->event_id, &args->event_trigger_data, +- &args->event_page_offset, +- &args->event_slot_index); ++ return err; + ++out_upwrite: ++ mutex_unlock(&p->mutex); + return err; + } + +@@ -877,14 +1066,17 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, + + return err; + } +-static int kfd_ioctl_set_scratch_backing_va(struct file *filep, ++static int kfd_ioctl_alloc_scratch_memory(struct file *filep, + struct kfd_process *p, void *data) + { +- struct kfd_ioctl_set_scratch_backing_va_args *args = data; ++ struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; + struct kfd_process_device *pdd; + struct kfd_dev *dev; + long err; + ++ if (args->size == 0) ++ return -EINVAL; ++ + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; +@@ -901,15 +1093,20 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep, + + mutex_unlock(&p->mutex); + +- if (sched_policy == KFD_SCHED_POLICY_NO_HWS && pdd->qpd.vmid != 0) +- dev->kfd2kgd->set_scratch_backing_va( ++ if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && ++ pdd->qpd.vmid != 0) { ++ err = dev->kfd2kgd->alloc_memory_of_scratch( + dev->kgd, args->va_addr, pdd->qpd.vmid); ++ if (err != 0) ++ goto alloc_memory_of_scratch_failed; ++ } + + return 0; + + bind_process_to_device_fail: + mutex_unlock(&p->mutex); +- return err; ++alloc_memory_of_scratch_failed: ++ return -EFAULT; + } + + static int kfd_ioctl_get_tile_config(struct file *filep, +@@ -954,6 +1151,770 @@ static int kfd_ioctl_get_tile_config(struct file *filep, + return 0; + } + ++bool kfd_dev_is_large_bar(struct kfd_dev *dev) ++{ ++ struct kfd_local_mem_info mem_info; ++ ++ if (debug_largebar) { ++ pr_debug("Simulate large-bar allocation on non large-bar machine\n"); ++ return true; ++ } ++ ++ if (dev->device_info->needs_iommu_device) ++ return false; ++ ++ dev->kfd2kgd->get_local_mem_info(dev->kgd, &mem_info); ++ if (mem_info.local_mem_size_private == 0 && ++ mem_info.local_mem_size_public > 0) ++ return true; ++ return false; ++} ++ ++static int 
kfd_ioctl_alloc_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_alloc_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ void *mem; ++ struct kfd_dev *dev; ++ int idr_handle; ++ long err; ++ uint64_t offset = args->mmap_offset; ++ uint32_t flags = args->flags; ++ struct vm_area_struct *vma; ++ ++ if (args->size == 0) ++ return -EINVAL; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) ++ return -EINVAL; ++ ++ if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && ++ (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && ++ !kfd_dev_is_large_bar(dev)) { ++ pr_err("Alloc host visible vram on small bar is not allowed\n"); ++ return -EINVAL; ++ } ++ ++ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { ++ /* Check if the userptr corresponds to another (or third-party) ++ * device local memory. If so treat is as a doorbell. User ++ * space will be oblivious of this and will use this doorbell ++ * BO as a regular userptr BO ++ */ ++ vma = find_vma(current->mm, args->mmap_offset); ++ if (vma && (vma->vm_flags & VM_IO)) { ++ unsigned long pfn; ++ ++ follow_pfn(vma, args->mmap_offset, &pfn); ++ flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; ++ flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; ++ offset = (pfn << PAGE_SHIFT); ++ } ++ } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { ++ if (args->size != kfd_doorbell_process_slice(dev)) ++ return -EINVAL; ++ offset = kfd_get_process_doorbells(dev, p); ++ } ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ err = PTR_ERR(pdd); ++ goto err_unlock; ++ } ++ ++ err = dev->kfd2kgd->alloc_memory_of_gpu( ++ dev->kgd, args->va_addr, args->size, ++ pdd->vm, (struct kgd_mem **) &mem, &offset, ++ flags); ++ ++ if (err) ++ goto err_unlock; ++ ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ args->va_addr, args->size, NULL); ++ if (idr_handle < 0) { ++ err = -EFAULT; ++ goto err_free; ++ } ++ ++ mutex_unlock(&p->mutex); ++ ++ args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); ++ args->mmap_offset = offset; ++ ++ return 0; ++ ++err_free: ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *) mem); ++err_unlock: ++ mutex_unlock(&p->mutex); ++ return err; ++} ++ ++static int kfd_ioctl_free_memory_of_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_free_memory_of_gpu_args *args = data; ++ struct kfd_process_device *pdd; ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ int ret; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (!dev) ++ return -EINVAL; ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_err("Process device data doesn't exist\n"); ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ ++ buf_obj = kfd_process_device_find_bo(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ if (!buf_obj) { ++ ret = -EINVAL; ++ goto err_unlock; ++ } ++ run_rdma_free_callback(buf_obj); ++ ++ ret = dev->kfd2kgd->free_memory_of_gpu(dev->kgd, buf_obj->mem); ++ ++ /* If freeing the buffer failed, leave the handle in place for ++ * clean-up during process tear-down. 
++ */ ++ if (ret == 0) ++ kfd_process_device_remove_obj_handle( ++ pdd, GET_IDR_HANDLE(args->handle)); ++ ++err_unlock: ++ mutex_unlock(&p->mutex); ++ return ret; ++} ++ ++static int kfd_ioctl_map_memory_to_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_map_memory_to_gpu_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ int i, num_dev = 0; ++ uint32_t *devices_arr = NULL; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (!dev) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size == 0) { ++ pr_debug("Device ID array size is 0\n"); ++ return -EINVAL; ++ } ++ ++ if (args->device_ids_array_size % sizeof(uint32_t)) { ++ pr_debug("Node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EINVAL; ++ } ++ ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array_ptr, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ err = PTR_ERR(pdd); ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ if (!mem) { ++ err = -ENOMEM; ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ pr_debug("Getting device by id failed for 0x%x\n", ++ devices_arr[i]); ++ err = -EINVAL; ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ peer_pdd = kfd_bind_process_to_device(peer, p); ++ if (IS_ERR(peer_pdd)) { ++ err = PTR_ERR(peer_pdd); ++ goto get_mem_obj_from_handle_failed; ++ } ++ err = peer->kfd2kgd->map_memory_to_gpu( ++ peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); ++ if (err != 0) { ++ pr_err("Failed to map to gpu %d, num_dev=%d\n", ++ i, num_dev); ++ goto map_memory_to_gpu_failed; ++ } ++ } ++ ++ mutex_unlock(&p->mutex); ++ ++ err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true); ++ if (err) { ++ pr_debug("Sync memory failed, wait interrupted by user signal\n"); ++ goto sync_memory_failed; ++ } ++ ++ /* Flush TLBs after waiting for the page table updates to complete */ ++ for (i = 0; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (WARN_ON_ONCE(!peer)) ++ continue; ++ peer_pdd = kfd_get_process_device_data(peer, p); ++ if (WARN_ON_ONCE(!peer_pdd)) ++ continue; ++ kfd_flush_tlb(peer_pdd); ++ } ++ ++ kfree(devices_arr); ++ ++ return err; ++ ++bind_process_to_device_failed: ++get_mem_obj_from_handle_failed: ++map_memory_to_gpu_failed: ++ mutex_unlock(&p->mutex); ++copy_from_user_failed: ++sync_memory_failed: ++ kfree(devices_arr); ++ ++ return err; ++} ++ ++static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; ++ struct kfd_process_device *pdd, *peer_pdd; ++ void *mem; ++ struct kfd_dev *dev, *peer; ++ long err = 0; ++ uint32_t *devices_arr = NULL, num_dev, i; ++ ++ dev = kfd_device_by_id(GET_GPU_ID(args->handle)); ++ if (!dev) ++ return -EINVAL; ++ ++ if (args->device_ids_array_size == 0) { ++ pr_debug("Device ID array size is 0\n"); ++ return -EINVAL; ++ } ++ ++ if (args->device_ids_array_size % 
sizeof(uint32_t)) { ++ pr_debug("Node IDs array size %u\n", ++ args->device_ids_array_size); ++ return -EINVAL; ++ } ++ ++ devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); ++ if (!devices_arr) ++ return -ENOMEM; ++ ++ err = copy_from_user(devices_arr, ++ (void __user *)args->device_ids_array_ptr, ++ args->device_ids_array_size); ++ if (err != 0) { ++ err = -EFAULT; ++ goto copy_from_user_failed; ++ } ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (!pdd) { ++ pr_debug("Process device data doesn't exist\n"); ++ err = -ENODEV; ++ goto bind_process_to_device_failed; ++ } ++ ++ mem = kfd_process_device_translate_handle(pdd, ++ GET_IDR_HANDLE(args->handle)); ++ if (!mem) { ++ err = -ENOMEM; ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ num_dev = args->device_ids_array_size / sizeof(uint32_t); ++ for (i = 0 ; i < num_dev; i++) { ++ peer = kfd_device_by_id(devices_arr[i]); ++ if (!peer) { ++ err = -EINVAL; ++ goto get_mem_obj_from_handle_failed; ++ } ++ ++ peer_pdd = kfd_get_process_device_data(peer, p); ++ if (!peer_pdd) { ++ err = -ENODEV; ++ goto get_mem_obj_from_handle_failed; ++ } ++ err = dev->kfd2kgd->unmap_memory_to_gpu( ++ peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); ++ if (err) { ++ pr_err("Failed to unmap from gpu %d/%d\n", ++ i, num_dev); ++ goto unmap_memory_from_gpu_failed; ++ } ++ } ++ kfree(devices_arr); ++ ++ mutex_unlock(&p->mutex); ++ ++ return 0; ++ ++bind_process_to_device_failed: ++get_mem_obj_from_handle_failed: ++unmap_memory_from_gpu_failed: ++ mutex_unlock(&p->mutex); ++copy_from_user_failed: ++ kfree(devices_arr); ++ return err; ++} ++ ++static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; ++ struct kfd_dev *dev; ++ struct kfd_process_device *pdd; ++ long err; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) ++ return -EINVAL; ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ err = PTR_ERR(pdd); ++ goto exit; ++ } ++ ++ err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, ++ args->dgpu_limit); ++ ++exit: ++ mutex_unlock(&p->mutex); ++ return err; ++} ++ ++static int kfd_ioctl_get_dmabuf_info(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_dmabuf_info_args *args = data; ++ struct kfd_dev *dev = NULL; ++ struct kgd_dev *dma_buf_kgd; ++ void *metadata_buffer = NULL; ++ uint32_t flags; ++ unsigned int i; ++ int r; ++ ++ /* Find a KFD GPU device that supports the get_dmabuf_info query */ ++ for (i = 0; kfd_topology_enum_kfd_devices(i, &dev) == 0; i++) ++ if (dev && dev->kfd2kgd->get_dmabuf_info) ++ break; ++ if (!dev) ++ return -EINVAL; ++ ++ if (args->metadata_ptr) { ++ metadata_buffer = kzalloc(args->metadata_size, GFP_KERNEL); ++ if (!metadata_buffer) ++ return -ENOMEM; ++ } ++ ++ /* Get dmabuf info from KGD */ ++ r = dev->kfd2kgd->get_dmabuf_info(dev->kgd, args->dmabuf_fd, ++ &dma_buf_kgd, &args->size, ++ metadata_buffer, args->metadata_size, ++ &args->metadata_size, &flags); ++ if (r) ++ goto exit; ++ ++ /* Reverse-lookup gpu_id from kgd pointer */ ++ dev = kfd_device_by_kgd(dma_buf_kgd); ++ if (!dev) { ++ r = -EINVAL; ++ goto exit; ++ } ++ args->gpu_id = dev->id; ++ args->flags = flags; ++ ++ /* Copy metadata buffer to user mode */ ++ if (metadata_buffer) { ++ r = copy_to_user((void __user *)args->metadata_ptr, ++ metadata_buffer, args->metadata_size); ++ if (r != 0) ++ r = -EFAULT; ++ } ++ 
++exit: ++ kfree(metadata_buffer); ++ ++ return r; ++} ++ ++static int kfd_ioctl_import_dmabuf(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_import_dmabuf_args *args = data; ++ struct kfd_dev *dev; ++ int r; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) ++ return -EINVAL; ++ ++ r = kfd_ipc_import_dmabuf(dev, p, args->gpu_id, args->dmabuf_fd, ++ args->va_addr, &args->handle, NULL); ++ if (r) ++ pr_err("Failed to import dmabuf\n"); ++ ++ return r; ++} ++ ++static int kfd_ioctl_ipc_export_handle(struct file *filep, ++ struct kfd_process *p, ++ void *data) ++{ ++ struct kfd_ioctl_ipc_export_handle_args *args = data; ++ struct kfd_dev *dev; ++ int r; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) ++ return -EINVAL; ++ ++ r = kfd_ipc_export_as_handle(dev, p, args->handle, args->share_handle); ++ if (r) ++ pr_err("Failed to export IPC handle\n"); ++ ++ return r; ++} ++ ++static int kfd_ioctl_ipc_import_handle(struct file *filep, ++ struct kfd_process *p, ++ void *data) ++{ ++ struct kfd_ioctl_ipc_import_handle_args *args = data; ++ struct kfd_dev *dev = NULL; ++ int r; ++ ++ dev = kfd_device_by_id(args->gpu_id); ++ if (!dev) ++ return -EINVAL; ++ ++ r = kfd_ipc_import_handle(dev, p, args->gpu_id, args->share_handle, ++ args->va_addr, &args->handle, ++ &args->mmap_offset); ++ if (r) ++ pr_err("Failed to import IPC handle\n"); ++ ++ return r; ++} ++ ++static int kfd_ioctl_cross_memory_copy(struct file *filep, ++ struct kfd_process *local_p, void *data) ++{ ++ struct kfd_ioctl_cross_memory_copy_args *args = data; ++ struct kfd_memory_range *src_array, *dst_array; ++ struct kfd_bo *src_bo, *dst_bo; ++ struct kfd_process *remote_p, *src_p, *dst_p; ++ struct task_struct *remote_task; ++ struct mm_struct *remote_mm; ++ struct pid *remote_pid; ++ struct dma_fence *fence = NULL, *lfence = NULL; ++ uint64_t dst_va_addr; ++ uint64_t copied, total_copied = 0; ++ uint64_t src_offset, dst_offset, dst_va_addr_end; ++ const char *cma_op; ++ int i, j = 0, err = 0; ++ ++ /* Check parameters */ ++ if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || ++ args->src_mem_array_size == 0 || args->dst_mem_array_size == 0) ++ return -EINVAL; ++ args->bytes_copied = 0; ++ ++ /* Allocate space for source and destination arrays */ ++ src_array = kmalloc_array((args->src_mem_array_size + ++ args->dst_mem_array_size), ++ sizeof(struct kfd_memory_range), ++ GFP_KERNEL); ++ if (!src_array) ++ return -ENOMEM; ++ dst_array = &src_array[args->src_mem_array_size]; ++ ++ if (copy_from_user(src_array, (void __user *)args->src_mem_range_array, ++ args->src_mem_array_size * ++ sizeof(struct kfd_memory_range))) { ++ err = -EFAULT; ++ goto copy_from_user_fail; ++ } ++ if (copy_from_user(dst_array, (void __user *)args->dst_mem_range_array, ++ args->dst_mem_array_size * ++ sizeof(struct kfd_memory_range))) { ++ err = -EFAULT; ++ goto copy_from_user_fail; ++ } ++ ++ /* Get remote process */ ++ remote_pid = find_get_pid(args->pid); ++ if (!remote_pid) { ++ pr_err("Cross mem copy failed. Invalid PID %d\n", args->pid); ++ err = -ESRCH; ++ goto copy_from_user_fail; ++ } ++ ++ remote_task = get_pid_task(remote_pid, PIDTYPE_PID); ++ if (!remote_pid) { ++ pr_err("Cross mem copy failed. Invalid PID or task died %d\n", ++ args->pid); ++ err = -ESRCH; ++ goto get_pid_task_fail; ++ } ++ ++ /* Check access permission */ ++ remote_mm = mm_access(remote_task, PTRACE_MODE_ATTACH_REALCREDS); ++ if (!remote_mm || IS_ERR(remote_mm)) { ++ err = IS_ERR(remote_mm) ? 
PTR_ERR(remote_mm) : -ESRCH; ++ if (err == -EACCES) { ++ pr_err("Cross mem copy failed. Permission error\n"); ++ err = -EPERM; ++ } else ++ pr_err("Cross mem copy failed. Invalid task %d\n", ++ err); ++ goto mm_access_fail; ++ } ++ ++ remote_p = kfd_get_process(remote_task); ++ if (!remote_p) { ++ pr_err("Cross mem copy failed. Invalid kfd process %d\n", ++ args->pid); ++ err = -EINVAL; ++ goto kfd_process_fail; ++ } ++ ++ if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { ++ src_p = local_p; ++ dst_p = remote_p; ++ cma_op = "WRITE"; ++ pr_debug("CMA WRITE: local -> remote\n"); ++ } else { ++ src_p = remote_p; ++ dst_p = local_p; ++ cma_op = "READ"; ++ pr_debug("CMA READ: remote -> local\n"); ++ } ++ ++ ++ /* For each source kfd_range: ++ * - Find the BO. Each range has to be within the same BO. ++ * - Copy this range to single or multiple destination BOs. ++ * - dst_va_addr - will point to next va address into which data will ++ * be copied. ++ * - dst_bo & src_bo - the current destination and source BOs ++ * - src_offset & dst_offset - offset into the respective BOs from ++ * data will be sourced or copied ++ */ ++ dst_va_addr = dst_array[0].va_addr; ++ dst_va_addr_end = dst_va_addr + dst_array[0].size - 1; ++ mutex_lock(&dst_p->mutex); ++ dst_bo = kfd_process_find_bo_from_interval(dst_p, ++ dst_va_addr, ++ dst_va_addr_end); ++ mutex_unlock(&dst_p->mutex); ++ if (!dst_bo || dst_va_addr_end > dst_bo->it.last) { ++ pr_err("CMA %s failed. Invalid dst range\n", cma_op); ++ err = -EFAULT; ++ goto kfd_process_fail; ++ } ++ dst_offset = dst_va_addr - dst_bo->it.start; ++ ++ for (i = 0; i < args->src_mem_array_size; i++) { ++ uint64_t src_va_addr_end = src_array[i].va_addr + ++ src_array[i].size - 1; ++ uint64_t src_size_to_copy = src_array[i].size; ++ ++ mutex_lock(&src_p->mutex); ++ src_bo = kfd_process_find_bo_from_interval(src_p, ++ src_array[i].va_addr, ++ src_va_addr_end); ++ mutex_unlock(&src_p->mutex); ++ if (!src_bo || src_va_addr_end > src_bo->it.last) { ++ pr_err("CMA %s failed. Invalid src range\n", cma_op); ++ err = -EFAULT; ++ break; ++ } ++ ++ src_offset = src_array[i].va_addr - src_bo->it.start; ++ ++ /* Copy src_bo to one or multiple dst_bo(s) based on size and ++ * and current copy location. ++ */ ++ while (j < args->dst_mem_array_size) { ++ uint64_t copy_size; ++ int64_t space_left; ++ ++ /* Find the current copy_size. This will be smaller of ++ * the following ++ * - space left in the current dest memory range ++ * - data left to copy from source range ++ */ ++ space_left = (dst_array[j].va_addr + dst_array[j].size) ++ - dst_va_addr; ++ copy_size = (src_size_to_copy < space_left) ? ++ src_size_to_copy : space_left; ++ ++ /* Check both BOs belong to same device */ ++ if (src_bo->dev->kgd != dst_bo->dev->kgd) { ++ pr_err("CMA %s fail. Not same dev\n", cma_op); ++ err = -EINVAL; ++ break; ++ } ++ ++ /* Store prev fence. Release it when a later fence is ++ * created ++ */ ++ lfence = fence; ++ fence = NULL; ++ ++ err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( ++ src_bo->dev->kgd, ++ src_bo->mem, src_offset, ++ dst_bo->mem, dst_offset, ++ copy_size, ++ &fence, &copied); ++ ++ if (err) { ++ pr_err("GPU CMA %s failed\n", cma_op); ++ err = -EFAULT; ++ break; ++ } ++ ++ /* Later fence available. 
Release old fence */ ++ if (fence && lfence) { ++ dma_fence_put(lfence); ++ lfence = NULL; ++ } ++ ++ total_copied += copied; ++ src_size_to_copy -= copied; ++ space_left -= copied; ++ dst_va_addr += copied; ++ dst_offset += copied; ++ src_offset += copied; ++ if (dst_va_addr > dst_bo->it.last + 1) { ++ pr_err("CMA %s fail. Mem overflow\n", cma_op); ++ err = -EFAULT; ++ break; ++ } ++ ++ /* If the cur dest range is full move to next one */ ++ if (space_left <= 0) { ++ if (++j >= args->dst_mem_array_size) ++ break; ++ ++ dst_va_addr = dst_array[j].va_addr; ++ dst_va_addr_end = dst_va_addr + ++ dst_array[j].size - 1; ++ dst_bo = kfd_process_find_bo_from_interval( ++ dst_p, ++ dst_va_addr, ++ dst_va_addr_end); ++ if (!dst_bo || ++ dst_va_addr_end > dst_bo->it.last) { ++ pr_err("CMA %s failed. Invalid dst range\n", ++ cma_op); ++ err = -EFAULT; ++ break; ++ } ++ dst_offset = dst_va_addr - dst_bo->it.start; ++ } ++ ++ /* If the cur src range is done, move to next one */ ++ if (src_size_to_copy <= 0) ++ break; ++ } ++ if (err) ++ break; ++ } ++ ++ /* Wait for the last fence irrespective of error condition */ ++ if (fence) { ++ if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) ++ < 0) ++ pr_err("CMA %s failed. BO timed out\n", cma_op); ++ dma_fence_put(fence); ++ } else if (lfence) { ++ pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); ++ dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); ++ dma_fence_put(lfence); ++ } ++ ++kfd_process_fail: ++ mmput(remote_mm); ++mm_access_fail: ++ put_task_struct(remote_task); ++get_pid_task_fail: ++ put_pid(remote_pid); ++copy_from_user_fail: ++ kfree(src_array); ++ ++ /* An error could happen after partial copy. In that case this will ++ * reflect partial amount of bytes copied ++ */ ++ args->bytes_copied = total_copied; ++ return err; ++} ++ ++static int kfd_ioctl_get_queue_wave_state(struct file *filep, ++ struct kfd_process *p, void *data) ++{ ++ struct kfd_ioctl_get_queue_wave_state_args *args = data; ++ int r; ++ ++ mutex_lock(&p->mutex); ++ ++ r = pqm_get_wave_state(&p->pqm, args->queue_id, ++ (void __user *)args->ctl_stack_address, ++ &args->ctl_stack_used_size, ++ &args->save_area_used_size); ++ ++ mutex_unlock(&p->mutex); ++ ++ return r; ++} ++ + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ + [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ + .cmd_drv = 0, .name = #ioctl} +@@ -1008,14 +1969,54 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + kfd_ioctl_dbg_wave_control, 0), + +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, +- kfd_ioctl_set_scratch_backing_va, 0), ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, ++ kfd_ioctl_alloc_memory_of_gpu, 0), + +- AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, +- kfd_ioctl_get_tile_config, 0), ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_FREE_MEMORY_OF_GPU, ++ kfd_ioctl_free_memory_of_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_MAP_MEMORY_TO_GPU, ++ kfd_ioctl_map_memory_to_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, ++ kfd_ioctl_unmap_memory_from_gpu, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, ++ kfd_ioctl_alloc_scratch_memory, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, ++ kfd_ioctl_set_cu_mask, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, ++ kfd_ioctl_set_process_dgpu_aperture, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, ++ 
kfd_ioctl_get_process_apertures_new, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, ++ kfd_ioctl_get_dmabuf_info, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, ++ kfd_ioctl_import_dmabuf, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, ++ kfd_ioctl_get_tile_config, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE, ++ kfd_ioctl_ipc_import_handle, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_EXPORT_HANDLE, ++ kfd_ioctl_ipc_export_handle, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_CROSS_MEMORY_COPY, ++ kfd_ioctl_cross_memory_copy, 0), ++ ++ AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE, ++ kfd_ioctl_get_queue_wave_state, 0) ++ + }; + + #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) +@@ -1111,23 +2112,34 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct kfd_process *process; ++ struct kfd_dev *kfd; ++ unsigned long vm_pgoff; ++ unsigned long long mmap_type; + + process = kfd_get_process(current); + if (IS_ERR(process)) + return PTR_ERR(process); + +- if ((vma->vm_pgoff & KFD_MMAP_DOORBELL_MASK) == +- KFD_MMAP_DOORBELL_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_DOORBELL_MASK; +- return kfd_doorbell_mmap(process, vma); +- } else if ((vma->vm_pgoff & KFD_MMAP_EVENTS_MASK) == +- KFD_MMAP_EVENTS_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; ++ vm_pgoff = vma->vm_pgoff; ++ vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); ++ mmap_type = vm_pgoff & KFD_MMAP_TYPE_MASK; ++ ++ switch (mmap_type) { ++ case KFD_MMAP_TYPE_DOORBELL: ++ kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); ++ if (!kfd) ++ return -EFAULT; ++ return kfd_doorbell_mmap(kfd, process, vma); ++ ++ case KFD_MMAP_TYPE_EVENTS: + return kfd_event_mmap(process, vma); +- } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == +- KFD_MMAP_RESERVED_MEM_MASK) { +- vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; ++ ++ case KFD_MMAP_TYPE_RESERVED_MEM: + return kfd_reserved_mem_mmap(process, vma); ++ ++ default: ++ pr_err("Unsupported kfd mmap type %llx\n", mmap_type); ++ break; + } + + return -EFAULT; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +index 2bc2816..24d0634 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -1,31 +1,11 @@ +-/* +- * Copyright 2015-2017 Advanced Micro Devices, Inc. +- * +- * Permission is hereby granted, free of charge, to any person obtaining a +- * copy of this software and associated documentation files (the "Software"), +- * to deal in the Software without restriction, including without limitation +- * the rights to use, copy, modify, merge, publish, distribute, sublicense, +- * and/or sell copies of the Software, and to permit persons to whom the +- * Software is furnished to do so, subject to the following conditions: +- * +- * The above copyright notice and this permission notice shall be included in +- * all copies or substantial portions of the Software. +- * +- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +- * OTHER DEALINGS IN THE SOFTWARE. +- */ +- +-#include <linux/pci.h> ++#include <linux/kernel.h> + #include <linux/acpi.h> +-#include <linux/amd-iommu.h> ++#include <linux/mm.h> ++#include <linux/pci.h> + #include "kfd_crat.h" + #include "kfd_priv.h" + #include "kfd_topology.h" ++#include "kfd_iommu.h" + + /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. + * GPU processor ID are expressed with Bit[31]=1. +@@ -132,6 +112,9 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { + #define fiji_cache_info carrizo_cache_info + #define polaris10_cache_info carrizo_cache_info + #define polaris11_cache_info carrizo_cache_info ++/* TODO - check & update Vega10 cache details */ ++#define vega10_cache_info carrizo_cache_info ++#define raven_cache_info carrizo_cache_info + + static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +@@ -186,6 +169,21 @@ static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + return 0; + } + ++static struct kfd_mem_properties * ++find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, ++ struct kfd_topology_device *dev) ++{ ++ struct kfd_mem_properties *props; ++ ++ list_for_each_entry(props, &dev->mem_props, list) { ++ if (props->heap_type == heap_type ++ && props->flags == flags ++ && props->width == width) ++ return props; ++ } ++ ++ return NULL; ++} + /* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +@@ -194,36 +192,56 @@ static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + { + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; ++ uint32_t heap_type; ++ uint64_t size_in_bytes; ++ uint32_t flags = 0; ++ uint32_t width; + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- + /* We're on GPU node */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) +- props->heap_type = ++ heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else +- props->heap_type = mem->visibility_type; ++ heap_type = mem->visibility_type; + } else +- props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; ++ heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) +- props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; ++ flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) +- props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; ++ flags |= HSA_MEM_FLAGS_NON_VOLATILE; + +- props->size_in_bytes = ++ size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; +- props->width = mem->width; ++ width = mem->width; ++ ++ /* Multiple banks of the same type are aggregated into ++ * one. User mode doesn't care about multiple physical ++ * memory segments. It's managed as a single virtual ++ * heap for user mode. 
++ */ ++ props = find_subtype_mem(heap_type, flags, width, dev); ++ if (props) { ++ props->size_in_bytes += size_in_bytes; ++ break; ++ } ++ ++ props = kfd_alloc_struct(props); ++ if (!props) ++ return -ENOMEM; ++ ++ props->heap_type = heap_type; ++ props->flags = flags; ++ props->size_in_bytes = size_in_bytes; ++ props->width = width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); +@@ -248,7 +266,6 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + + id = cache->processor_id_low; + +- pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); +@@ -398,15 +415,11 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + ret = kfd_parse_subtype_cache(cache, device_list); + break; + case CRAT_SUBTYPE_TLB_AFFINITY: +- /* +- * For now, nothing to do here +- */ ++ /* For now, nothing to do here */ + pr_debug("Found TLB entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: +- /* +- * For now, nothing to do here +- */ ++ /* For now, nothing to do here */ + pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); + break; + case CRAT_SUBTYPE_IOLINK_AFFINITY: +@@ -431,8 +444,9 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, + * + * Return - 0 if successful else -ve value + */ +-int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, +- uint32_t proximity_domain) ++int kfd_parse_crat_table(void *crat_image, ++ struct list_head *device_list, ++ uint32_t proximity_domain) + { + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; +@@ -603,6 +617,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + pcache_info = polaris11_cache_info; + num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); + break; ++ case CHIP_VEGA10: ++ pcache_info = vega10_cache_info; ++ num_of_cache_types = ARRAY_SIZE(vega10_cache_info); ++ break; ++ case CHIP_RAVEN: ++ pcache_info = raven_cache_info; ++ num_of_cache_types = ARRAY_SIZE(raven_cache_info); ++ break; + default: + return -EINVAL; + } +@@ -671,8 +693,9 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, + * crat_image will be NULL + * @size: [OUT] size of crat_image + * +- * Return 0 if successful else return error code ++ * Return 0 if successful else return -ve value + */ ++#ifdef CONFIG_ACPI + int kfd_create_crat_image_acpi(void **crat_image, size_t *size) + { + struct acpi_table_header *crat_table; +@@ -702,8 +725,10 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size) + } + + pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); +- if (!pcrat_image) ++ if (!pcrat_image) { ++ pr_err("No memory for allocating CRAT image\n"); + return -ENOMEM; ++ } + + memcpy(pcrat_image, crat_table, crat_table->length); + +@@ -712,6 +737,7 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size) + + return 0; + } ++#endif + + /* Memory required to create Virtual CRAT. 
+ * Since there is no easy way to predict the amount of memory required, the +@@ -806,6 +832,7 @@ static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + return 0; + } + ++#ifdef CONFIG_X86_64 + static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + uint32_t *num_entries, + struct crat_subtype_iolink *sub_type_hdr) +@@ -848,6 +875,7 @@ static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + + return 0; + } ++#endif + + /* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * +@@ -858,13 +886,17 @@ static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + { + struct crat_header *crat_table = (struct crat_header *)pcrat_image; +- struct acpi_table_header *acpi_table; +- acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; +- uint32_t entries = 0; + int ret = 0; ++#ifdef CONFIG_ACPI ++ struct acpi_table_header *acpi_table; ++ acpi_status status; ++#endif ++#ifdef CONFIG_X86_64 ++ uint32_t entries = 0; ++#endif + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; +@@ -881,6 +913,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + ++#ifdef CONFIG_ACPI + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status == AE_NOT_FOUND) + pr_warn("DSDT table not found for OEM information\n"); +@@ -891,6 +924,11 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + } ++#else ++ crat_table->oem_revision = 0; ++ memcpy(crat_table->oem_id, "INV", CRAT_OEMID_LENGTH); ++ memcpy(crat_table->oem_table_id, "UNAVAIL", CRAT_OEMTABLEID_LENGTH); ++#endif + crat_table->total_entries = 0; + crat_table->num_domains = 0; + +@@ -925,6 +963,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + sub_type_hdr->length); + + /* Fill in Subtype: IO Link */ ++#ifdef CONFIG_X86_64 + ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, + &entries, + (struct crat_subtype_iolink *)sub_type_hdr); +@@ -935,6 +974,9 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); ++#else ++ pr_info("IO link not available for non x86 platforms\n"); ++#endif + + crat_table->num_domains++; + } +@@ -1030,22 +1072,18 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size, + * [OUT] actual size of data filled in crat_image + */ + static int kfd_create_vcrat_image_gpu(void *pcrat_image, +- size_t *size, struct kfd_dev *kdev, +- uint32_t proximity_domain) ++ size_t *size, struct kfd_dev *kdev, ++ uint32_t proximity_domain) + { + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct crat_subtype_generic *sub_type_hdr; + struct crat_subtype_computeunit *cu; + struct kfd_cu_info cu_info; +- struct amd_iommu_device_info iommu_info; + int avail_size = *size; + uint32_t total_num_of_cu; + int num_of_cache_entries = 0; + int cache_mem_filled = 0; + int ret = 0; +- const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | +- AMD_IOMMU_DEVICE_FLAG_PRI_SUP | +- AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + struct kfd_local_mem_info local_mem_info; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) +@@ -1106,12 +1144,8 @@ static int 
kfd_create_vcrat_image_gpu(void *pcrat_image, + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT + */ +- iommu_info.flags = 0; +- if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { +- if ((iommu_info.flags & required_iommu_flags) == +- required_iommu_flags) +- cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; +- } ++ if (!kfd_iommu_check_device(kdev)) ++ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; +@@ -1125,6 +1159,9 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + ++ if (debug_largebar) ++ local_mem_info.local_mem_size_private = 0; ++ + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, +@@ -1204,8 +1241,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, + * Return 0 if successful else return -ve value + */ + int kfd_create_crat_image_virtual(void **crat_image, size_t *size, +- int flags, struct kfd_dev *kdev, +- uint32_t proximity_domain) ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain) + { + void *pcrat_image = NULL; + int ret = 0; +@@ -1235,8 +1271,8 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + if (!pcrat_image) + return -ENOMEM; + *size = VCRAT_SIZE_FOR_GPU; +- ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, +- proximity_domain); ++ ret = kfd_create_vcrat_image_gpu(pcrat_image, size, ++ kdev, proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): + /* TODO: */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +index b5cd182..00de41f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +@@ -24,6 +24,7 @@ + #define KFD_CRAT_H_INCLUDED + + #include <linux/types.h> ++#include "kfd_priv.h" + + #pragma pack(1) + +@@ -227,12 +228,12 @@ struct crat_subtype_ccompute { + /* + * HSA IO Link Affinity structure and definitions + */ +-#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +-#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +-#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +-#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +-#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 ++#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) ++#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) ++#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) ++#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) ++#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 + + /* + * IO interface types +@@ -240,18 +241,18 @@ struct crat_subtype_ccompute { + #define CRAT_IOLINK_TYPE_UNDEFINED 0 + #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 + #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 +-#define CRAT_IOLINK_TYPE_AMBA 3 +-#define CRAT_IOLINK_TYPE_MIPI 4 +-#define CRAT_IOLINK_TYPE_QPI_1_1 5 +-#define CRAT_IOLINK_TYPE_RESERVED1 6 +-#define CRAT_IOLINK_TYPE_RESERVED2 7 +-#define CRAT_IOLINK_TYPE_RAPID_IO 8 +-#define CRAT_IOLINK_TYPE_INFINIBAND 9 +-#define CRAT_IOLINK_TYPE_RESERVED3 10 +-#define CRAT_IOLINK_TYPE_OTHER 11 +-#define CRAT_IOLINK_TYPE_MAX 255 +- +-#define CRAT_IOLINK_RESERVED_LENGTH 24 ++#define CRAT_IOLINK_TYPE_AMBA 3 ++#define CRAT_IOLINK_TYPE_MIPI 4 ++#define CRAT_IOLINK_TYPE_QPI_1_1 5 ++#define CRAT_IOLINK_TYPE_RESERVED1 6 
++#define CRAT_IOLINK_TYPE_RESERVED2 7 ++#define CRAT_IOLINK_TYPE_RAPID_IO 8 ++#define CRAT_IOLINK_TYPE_INFINIBAND 9 ++#define CRAT_IOLINK_TYPE_RESERVED3 10 ++#define CRAT_IOLINK_TYPE_OTHER 11 ++#define CRAT_IOLINK_TYPE_MAX 255 ++ ++#define CRAT_IOLINK_RESERVED_LENGTH 24 + + struct crat_subtype_iolink { + uint8_t type; +@@ -307,14 +308,13 @@ struct cdit_header { + + #pragma pack() + +-struct kfd_dev; +- ++#ifdef CONFIG_ACPI + int kfd_create_crat_image_acpi(void **crat_image, size_t *size); ++#endif + void kfd_destroy_crat_image(void *crat_image); +-int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, +- uint32_t proximity_domain); ++int kfd_parse_crat_table(void *crat_image, ++ struct list_head *device_list, ++ uint32_t proximity_domain); + int kfd_create_crat_image_virtual(void **crat_image, size_t *size, +- int flags, struct kfd_dev *kdev, +- uint32_t proximity_domain); +- ++ int flags, struct kfd_dev *kdev, uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +index afb26f2..8d85e28 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +@@ -29,7 +29,7 @@ + #include <linux/mutex.h> + #include <linux/device.h> + +-#include "kfd_pm4_headers.h" ++#include "kfd_pm4_headers_vi.h" + #include "kfd_pm4_headers_diq.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" +@@ -47,9 +47,10 @@ static void dbgdev_address_watch_disable_nodiq(struct kfd_dev *dev) + + static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + unsigned int pasid, uint64_t vmid0_address, +- uint32_t *packet_buff, size_t size_in_bytes) ++ uint32_t *packet_buff, size_t size_in_bytes, ++ bool sync) + { +- struct pm4__release_mem *rm_packet; ++ struct pm4_mec_release_mem *rm_packet; + struct pm4__indirect_buffer_pasid *ib_packet; + struct kfd_mem_obj *mem_obj; + size_t pq_packets_size_in_bytes; +@@ -65,8 +66,9 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + kq = dbgdev->kq; + +- pq_packets_size_in_bytes = sizeof(struct pm4__release_mem) + +- sizeof(struct pm4__indirect_buffer_pasid); ++ pq_packets_size_in_bytes = sizeof(struct pm4__indirect_buffer_pasid); ++ if (sync) ++ pq_packets_size_in_bytes += sizeof(struct pm4_mec_release_mem); + + /* + * We acquire a buffer from DIQ +@@ -99,6 +101,11 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + ib_packet->bitfields5.pasid = pasid; + ++ if (!sync) { ++ kq->ops.submit_packet(kq); ++ return status; ++ } ++ + /* + * for now we use release mem for GPU-CPU synchronization + * Consider WaitRegMem + WriteData as a better alternative +@@ -107,7 +114,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + * (a) Sync with HW + * (b) Sync var is written by CP to mem. 
+ */ +- rm_packet = (struct pm4__release_mem *) (ib_packet_buff + ++ rm_packet = (struct pm4_mec_release_mem *) (ib_packet_buff + + (sizeof(struct pm4__indirect_buffer_pasid) / + sizeof(unsigned int))); + +@@ -126,7 +133,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, + + rm_packet->header.opcode = IT_RELEASE_MEM; + rm_packet->header.type = PM4_TYPE_3; +- rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; ++ rm_packet->header.count = sizeof(struct pm4_mec_release_mem) / 4 - 2; + + rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + rm_packet->bitfields2.event_index = +@@ -184,7 +191,6 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev) + int status; + + properties.type = KFD_QUEUE_TYPE_DIQ; +- + status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL, + &properties, &qid); + +@@ -232,7 +238,8 @@ static void dbgdev_address_watch_set_registers( + union TCP_WATCH_ADDR_H_BITS *addrHi, + union TCP_WATCH_ADDR_L_BITS *addrLo, + union TCP_WATCH_CNTL_BITS *cntl, +- unsigned int index, unsigned int vmid) ++ unsigned int index, unsigned int vmid, ++ bool is_apu) + { + union ULARGE_INTEGER addr; + +@@ -257,9 +264,9 @@ static void dbgdev_address_watch_set_registers( + + cntl->bitfields.mode = adw_info->watch_mode[index]; + cntl->bitfields.vmid = (uint32_t) vmid; +- /* for now assume it is an ATC address */ +- cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; +- ++ /* for APU assume it is an ATC address */ ++ if (is_apu) ++ cntl->u32All |= ADDRESS_WATCH_REG_CNTL_ATC_BIT; + pr_debug("\t\t%20s %08x\n", "set reg mask :", cntl->bitfields.mask); + pr_debug("\t\t%20s %08x\n", "set reg add high :", + addrHi->bitfields.addr); +@@ -301,7 +308,8 @@ static int dbgdev_address_watch_nodiq(struct kfd_dbgdev *dbgdev, + + for (i = 0; i < adw_info->num_watch_points; i++) { + dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, +- &cntl, i, pdd->qpd.vmid); ++ &cntl, i, pdd->qpd.vmid, ++ dbgdev->dev->device_info->needs_iommu_device); + + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); +@@ -340,9 +348,9 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + union TCP_WATCH_ADDR_H_BITS addrHi; + union TCP_WATCH_ADDR_L_BITS addrLo; + union TCP_WATCH_CNTL_BITS cntl; +- struct kfd_mem_obj *mem_obj; + unsigned int aw_reg_add_dword; + uint32_t *packet_buff_uint; ++ uint64_t packet_buff_gpu_addr; + unsigned int i; + int status; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 4; +@@ -364,15 +372,13 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + return -EINVAL; + } + +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); +- ++ status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, ++ ib_size/sizeof(uint32_t), ++ &packet_buff_uint, &packet_buff_gpu_addr); + if (status) { +- pr_err("Failed to allocate GART memory\n"); ++ pr_err("Failed to allocate IB from DIQ ring\n"); + return status; + } +- +- packet_buff_uint = mem_obj->cpu_ptr; +- + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) (packet_buff_uint); +@@ -391,12 +397,9 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + packets_vec[3].bitfields2.insert_vmid = 1; + + for (i = 0; i < adw_info->num_watch_points; i++) { +- dbgdev_address_watch_set_registers(adw_info, +- &addrHi, +- &addrLo, +- &cntl, +- i, +- vmid); ++ dbgdev_address_watch_set_registers(adw_info, &addrHi, &addrLo, ++ &cntl, i, vmid, ++ dbgdev->dev->device_info->needs_iommu_device); 
+ + pr_debug("\t\t%30s\n", "* * * * * * * * * * * * * * * * * *"); + pr_debug("\t\t%20s %08x\n", "register index :", i); +@@ -469,24 +472,24 @@ static int dbgdev_address_watch_diq(struct kfd_dbgdev *dbgdev, + status = dbgdev_diq_submit_ib( + dbgdev, + adw_info->process->pasid, +- mem_obj->gpu_addr, ++ packet_buff_gpu_addr, + packet_buff_uint, +- ib_size); ++ ib_size, true); + + if (status) { + pr_err("Failed to submit IB to DIQ\n"); +- break; ++ return status; + } + } + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); + return status; + } + + static int dbgdev_wave_control_set_registers( + struct dbg_wave_control_info *wac_info, + union SQ_CMD_BITS *in_reg_sq_cmd, +- union GRBM_GFX_INDEX_BITS *in_reg_gfx_index) ++ union GRBM_GFX_INDEX_BITS *in_reg_gfx_index, ++ unsigned int asic_family) + { + int status = 0; + union SQ_CMD_BITS reg_sq_cmd; +@@ -544,11 +547,25 @@ static int dbgdev_wave_control_set_registers( + + switch (wac_info->operand) { + case HSA_DBG_WAVEOP_HALT: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_HALT; ++ pr_debug("Halting KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_HALT; ++ pr_debug("Halting CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_RESUME: +- reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ if (asic_family == CHIP_KAVERI) { ++ reg_sq_cmd.bits.cmd = SQ_IND_CMD_CMD_RESUME; ++ pr_debug("Resuming KV\n"); ++ } else { ++ reg_sq_cmd.bits_sethalt.cmd = SQ_IND_CMD_NEW_SETHALT; ++ reg_sq_cmd.bits_sethalt.data = SQ_IND_CMD_DATA_RESUME; ++ pr_debug("Resuming CZ\n"); ++ } + break; + + case HSA_DBG_WAVEOP_KILL: +@@ -588,15 +605,15 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + int status; + union SQ_CMD_BITS reg_sq_cmd; + union GRBM_GFX_INDEX_BITS reg_gfx_index; +- struct kfd_mem_obj *mem_obj; + uint32_t *packet_buff_uint; ++ uint64_t packet_buff_gpu_addr; + struct pm4__set_config_reg *packets_vec; + size_t ib_size = sizeof(struct pm4__set_config_reg) * 3; + + reg_sq_cmd.u32All = 0; + + status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd, +- &reg_gfx_index); ++ &reg_gfx_index, dbgdev->dev->device_info->asic_family); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; +@@ -635,15 +652,13 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + + pr_debug("\t\t %30s\n", "* * * * * * * * * * * * * * * * * *"); + +- status = kfd_gtt_sa_allocate(dbgdev->dev, ib_size, &mem_obj); +- +- if (status != 0) { +- pr_err("Failed to allocate GART memory\n"); ++ status = dbgdev->kq->ops.acquire_inline_ib(dbgdev->kq, ++ ib_size / sizeof(uint32_t), ++ &packet_buff_uint, &packet_buff_gpu_addr); ++ if (status) { ++ pr_err("Failed to allocate IB from DIQ ring\n"); + return status; + } +- +- packet_buff_uint = mem_obj->cpu_ptr; +- + memset(packet_buff_uint, 0, ib_size); + + packets_vec = (struct pm4__set_config_reg *) packet_buff_uint; +@@ -683,15 +698,13 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, + status = dbgdev_diq_submit_ib( + dbgdev, + wac_info->process->pasid, +- mem_obj->gpu_addr, ++ packet_buff_gpu_addr, + packet_buff_uint, +- ib_size); ++ ib_size, false); + + if (status) + pr_err("Failed to submit IB to DIQ\n"); + +- kfd_gtt_sa_free(dbgdev->dev, mem_obj); +- + return status; + } + +@@ -713,7 +726,7 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev, + return -EFAULT; + } + status = dbgdev_wave_control_set_registers(wac_info, &reg_sq_cmd, +- 
&reg_gfx_index); ++ &reg_gfx_index, dbgdev->dev->device_info->asic_family); + if (status) { + pr_err("Failed to set wave control registers\n"); + return status; +@@ -805,7 +818,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) + return -EFAULT; + + status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd, +- &reg_gfx_index); ++ &reg_gfx_index, dev->device_info->asic_family); + if (status != 0) + return -EINVAL; + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +index 03424c2..583aaa9 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.h +@@ -60,6 +60,24 @@ enum { + SH_REG_SIZE = SH_REG_END - SH_REG_BASE + }; + ++/* SQ_CMD definitions */ ++ ++enum { ++ SQ_IND_CMD_DATA_RESUME = 0, ++ SQ_IND_CMD_DATA_HALT = 1 ++}; ++ ++enum SQ_IND_CMD_NEW { ++ SQ_IND_CMD_NEW_NULL = 0x00000000, ++ SQ_IND_CMD_NEW_SETHALT = 0x00000001, ++ SQ_IND_CMD_NEW_SAVECTX = 0x00000002, ++ SQ_IND_CMD_NEW_KILL = 0x00000003, ++ SQ_IND_CMD_NEW_DEBUG = 0x00000004, ++ SQ_IND_CMD_NEW_TRAP = 0x00000005, ++ SQ_IND_CMD_NEW_SET_PRIO = 0x00000006 ++ ++}; ++ + enum SQ_IND_CMD_CMD { + SQ_IND_CMD_CMD_NULL = 0x00000000, + SQ_IND_CMD_CMD_HALT = 0x00000001, +@@ -118,6 +136,20 @@ union SQ_CMD_BITS { + uint32_t:1; + uint32_t vm_id:4; + } bitfields, bits; ++ struct { ++ uint32_t cmd:3; ++ uint32_t:1; ++ uint32_t mode:3; ++ uint32_t check_vmid:1; ++ uint32_t data:3; ++ uint32_t:5; ++ uint32_t wave_id:4; ++ uint32_t simd_id:2; ++ uint32_t:2; ++ uint32_t queue_id:3; ++ uint32_t:1; ++ uint32_t vm_id:4; ++ } bitfields_sethalt, bits_sethalt; + uint32_t u32All; + signed int i32All; + float f32All; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +index 3da25f7..9d4af96 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +@@ -33,6 +33,7 @@ + #include "kfd_pm4_headers_diq.h" + #include "kfd_dbgmgr.h" + #include "kfd_dbgdev.h" ++#include "kfd_device_queue_manager.h" + + static DEFINE_MUTEX(kfd_dbgmgr_mutex); + +@@ -83,7 +84,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) + } + + /* get actual type of DBGDevice cpsch or not */ +- if (sched_policy == KFD_SCHED_POLICY_NO_HWS) ++ if (pdev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) + type = DBGDEV_TYPE_NODIQ; + + kfd_dbgdev_init(new_buff->dbgdev, pdev, type); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +index 4bd6ebf..232e28f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +@@ -1,5 +1,5 @@ + /* +- * Copyright 2016-2017 Advanced Micro Devices, Inc. ++ * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +index a8fa33a..74fd3b2 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c +@@ -20,27 +20,35 @@ + * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +-#include <linux/amd-iommu.h> + #include <linux/bsearch.h> + #include <linux/pci.h> + #include <linux/slab.h> ++#include <linux/dma-fence.h> + #include "kfd_priv.h" + #include "kfd_device_queue_manager.h" + #include "kfd_pm4_headers_vi.h" + #include "cwsr_trap_handler_gfx8.asm" ++#include "cwsr_trap_handler_gfx9.asm" ++#include "kfd_iommu.h" + + #define MQD_SIZE_ALIGNED 768 ++static atomic_t kfd_device_suspended = ATOMIC_INIT(0); + ++#ifdef KFD_SUPPORT_IOMMU_V2 + static const struct kfd_device_info kaveri_device_info = { + .asic_family = CHIP_KAVERI, + .max_pasid_bits = 16, + /* max num of queues for KV.TODO should be a dynamic value */ + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, ++ .needs_iommu_device = true, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, + }; + + static const struct kfd_device_info carrizo_device_info = { +@@ -48,11 +56,183 @@ static const struct kfd_device_info carrizo_device_info = { + .max_pasid_bits = 16, + /* max num of queues for CZ.TODO should be a dynamic value */ + .max_no_of_hqd = 24, ++ .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, ++ .needs_iommu_device = true, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info raven_device_info = { ++ .asic_family = CHIP_RAVEN, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 8, ++ .ih_ring_entry_size = 8 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_v9, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = true, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 1, ++}; ++#endif ++ ++static const struct kfd_device_info hawaii_device_info = { ++ .asic_family = CHIP_HAWAII, ++ .max_pasid_bits = 16, ++ /* max num of queues for KV.TODO should be a dynamic value */ ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = false, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info tonga_device_info = { ++ .asic_family = CHIP_TONGA, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = false, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info tonga_vf_device_info = { ++ .asic_family = CHIP_TONGA, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = false, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info fiji_device_info = { ++ .asic_family = CHIP_FIJI, 
++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info fiji_vf_device_info = { ++ .asic_family = CHIP_FIJI, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, ++}; ++ ++ ++static const struct kfd_device_info polaris10_device_info = { ++ .asic_family = CHIP_POLARIS10, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info polaris10_vf_device_info = { ++ .asic_family = CHIP_POLARIS10, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info polaris11_device_info = { ++ .asic_family = CHIP_POLARIS11, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 4, ++ .ih_ring_entry_size = 4 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_cik, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info vega10_device_info = { ++ .asic_family = CHIP_VEGA10, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 8, ++ .ih_ring_entry_size = 8 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_v9, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = true, ++ .num_sdma_engines = 2, ++}; ++ ++static const struct kfd_device_info vega10_vf_device_info = { ++ .asic_family = CHIP_VEGA10, ++ .max_pasid_bits = 16, ++ .max_no_of_hqd = 24, ++ .doorbell_size = 8, ++ .ih_ring_entry_size = 8 * sizeof(uint32_t), ++ .event_interrupt_class = &event_interrupt_class_v9, ++ .num_of_watch_points = 4, ++ .mqd_size_aligned = MQD_SIZE_ALIGNED, ++ .supports_cwsr = true, ++ .needs_iommu_device = false, ++ .needs_pci_atomics = false, ++ .num_sdma_engines = 2, + }; + + struct kfd_deviceid { +@@ -60,8 +240,8 @@ struct kfd_deviceid { + const struct kfd_device_info *device_info; + }; + +-/* Please keep this sorted by increasing device id. 
*/ + static const struct kfd_deviceid supported_devices[] = { ++#ifdef KFD_SUPPORT_IOMMU_V2 + { 0x1304, &kaveri_device_info }, /* Kaveri */ + { 0x1305, &kaveri_device_info }, /* Kaveri */ + { 0x1306, &kaveri_device_info }, /* Kaveri */ +@@ -88,7 +268,61 @@ static const struct kfd_deviceid supported_devices[] = { + { 0x9874, &carrizo_device_info }, /* Carrizo */ + { 0x9875, &carrizo_device_info }, /* Carrizo */ + { 0x9876, &carrizo_device_info }, /* Carrizo */ +- { 0x9877, &carrizo_device_info } /* Carrizo */ ++ { 0x9877, &carrizo_device_info }, /* Carrizo */ ++ { 0x15DD, &raven_device_info }, /* Raven */ ++#endif ++ { 0x67A0, &hawaii_device_info }, /* Hawaii */ ++ { 0x67A1, &hawaii_device_info }, /* Hawaii */ ++ { 0x67A2, &hawaii_device_info }, /* Hawaii */ ++ { 0x67A8, &hawaii_device_info }, /* Hawaii */ ++ { 0x67A9, &hawaii_device_info }, /* Hawaii */ ++ { 0x67AA, &hawaii_device_info }, /* Hawaii */ ++ { 0x67B0, &hawaii_device_info }, /* Hawaii */ ++ { 0x67B1, &hawaii_device_info }, /* Hawaii */ ++ { 0x67B8, &hawaii_device_info }, /* Hawaii */ ++ { 0x67B9, &hawaii_device_info }, /* Hawaii */ ++ { 0x67BA, &hawaii_device_info }, /* Hawaii */ ++ { 0x67BE, &hawaii_device_info }, /* Hawaii */ ++ { 0x6920, &tonga_device_info }, /* Tonga */ ++ { 0x6921, &tonga_device_info }, /* Tonga */ ++ { 0x6928, &tonga_device_info }, /* Tonga */ ++ { 0x6929, &tonga_device_info }, /* Tonga */ ++ { 0x692B, &tonga_device_info }, /* Tonga */ ++ { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ ++ { 0x6938, &tonga_device_info }, /* Tonga */ ++ { 0x6939, &tonga_device_info }, /* Tonga */ ++ { 0x7300, &fiji_device_info }, /* Fiji */ ++ { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ ++ { 0x67C0, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C1, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C2, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C4, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C7, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C8, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67C9, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67CA, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67CC, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67CF, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ ++ { 0x67DF, &polaris10_device_info }, /* Polaris10 */ ++ { 0x67E0, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67E1, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67E3, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67E7, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67E8, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67E9, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67EB, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67EF, &polaris11_device_info }, /* Polaris11 */ ++ { 0x67FF, &polaris11_device_info }, /* Polaris11 */ ++ { 0x6860, &vega10_device_info }, /* Vega10 */ ++ { 0x6861, &vega10_device_info }, /* Vega10 */ ++ { 0x6862, &vega10_device_info }, /* Vega10 */ ++ { 0x6863, &vega10_device_info }, /* Vega10 */ ++ { 0x6864, &vega10_device_info }, /* Vega10 */ ++ { 0x6867, &vega10_device_info }, /* Vega10 */ ++ { 0x6868, &vega10_device_info }, /* Vega10 */ ++ { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ ++ { 0x687F, &vega10_device_info }, /* Vega10 */ + }; + + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, +@@ -127,6 +361,21 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + return NULL; + } + ++ if (device_info->needs_pci_atomics) { ++ /* Allow BIF to recode atomics to PCIe 3.0 
AtomicOps. ++ * 32 and 64-bit requests are possible and must be ++ * supported. ++ */ ++ if (pci_enable_atomic_ops_to_root(pdev, ++ PCI_EXP_DEVCAP2_ATOMIC_COMP32 | ++ PCI_EXP_DEVCAP2_ATOMIC_COMP64) < 0) { ++ dev_info(kfd_device, ++ "skipped device %x:%x, PCI rejects atomics", ++ pdev->vendor, pdev->device); ++ return NULL; ++ } ++ } ++ + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); + if (!kfd) + return NULL; +@@ -144,84 +393,19 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + return kfd; + } + +-static bool device_iommu_pasid_init(struct kfd_dev *kfd) +-{ +- const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | +- AMD_IOMMU_DEVICE_FLAG_PRI_SUP | +- AMD_IOMMU_DEVICE_FLAG_PASID_SUP; +- +- struct amd_iommu_device_info iommu_info; +- unsigned int pasid_limit; +- int err; +- +- err = amd_iommu_device_info(kfd->pdev, &iommu_info); +- if (err < 0) { +- dev_err(kfd_device, +- "error getting iommu info. is the iommu enabled?\n"); +- return false; +- } +- +- if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { +- dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", +- (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, +- (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, +- (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) +- != 0); +- return false; +- } +- +- pasid_limit = min_t(unsigned int, +- (unsigned int)(1 << kfd->device_info->max_pasid_bits), +- iommu_info.max_pasids); +- +- if (!kfd_set_pasid_limit(pasid_limit)) { +- dev_err(kfd_device, "error setting pasid limit\n"); +- return false; +- } +- +- return true; +-} +- +-static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) +-{ +- struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); +- +- if (dev) +- kfd_process_iommu_unbind_callback(dev, pasid); +-} +- +-/* +- * This function called by IOMMU driver on PPR failure +- */ +-static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, +- unsigned long address, u16 flags) +-{ +- struct kfd_dev *dev; +- +- dev_warn(kfd_device, +- "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X", +- PCI_BUS_NUM(pdev->devfn), +- PCI_SLOT(pdev->devfn), +- PCI_FUNC(pdev->devfn), +- pasid, +- address, +- flags); +- +- dev = kfd_device_by_pci_dev(pdev); +- if (!WARN_ON(!dev)) +- kfd_signal_iommu_event(dev, pasid, address, +- flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); +- +- return AMD_IOMMU_INV_PRI_RSP_INVALID; +-} +- + static void kfd_cwsr_init(struct kfd_dev *kfd) + { + if (cwsr_enable && kfd->device_info->supports_cwsr) { +- BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); ++ if (kfd->device_info->asic_family < CHIP_VEGA10) { ++ BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); ++ kfd->cwsr_isa = cwsr_trap_gfx8_hex; ++ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); ++ } else { ++ BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); ++ kfd->cwsr_isa = cwsr_trap_gfx9_hex; ++ kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); ++ } + +- kfd->cwsr_isa = cwsr_trap_gfx8_hex; +- kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + kfd->cwsr_enabled = true; + } + } +@@ -231,8 +415,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + { + unsigned int size; + ++ kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, ++ KGD_ENGINE_MEC1); ++ + kfd->shared_resources = *gpu_resources; + ++ /* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */ + kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1; + kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; + 
kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd +@@ -304,11 +492,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + goto device_queue_manager_error; + } + +- if (!device_iommu_pasid_init(kfd)) { +- dev_err(kfd_device, +- "Error initializing iommuv2 for device %x:%x\n", +- kfd->pdev->vendor, kfd->pdev->device); +- goto device_iommu_pasid_error; ++ if (kfd_iommu_device_init(kfd)) { ++ dev_err(kfd_device, "Error initializing iommuv2\n"); ++ goto device_iommu_error; + } + + kfd_cwsr_init(kfd); +@@ -323,12 +509,12 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, + kfd->pdev->device); + + pr_debug("Starting kfd with the following scheduling policy %d\n", +- sched_policy); ++ kfd->dqm->sched_policy); + + goto out; + + kfd_resume_error: +-device_iommu_pasid_error: ++device_iommu_error: + device_queue_manager_uninit(kfd->dqm); + device_queue_manager_error: + kfd_interrupt_exit(kfd); +@@ -362,45 +548,60 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) + kfree(kfd); + } + ++int kgd2kfd_pre_reset(struct kfd_dev *kfd) ++{ ++ return 0; ++} ++ ++int kgd2kfd_post_reset(struct kfd_dev *kfd) ++{ ++ return 0; ++} ++ + void kgd2kfd_suspend(struct kfd_dev *kfd) + { + if (!kfd->init_complete) + return; + +- kfd->dqm->ops.stop(kfd->dqm); ++ /* For first KFD device suspend all the KFD processes */ ++ if (atomic_inc_return(&kfd_device_suspended) == 1) ++ kfd_suspend_all_processes(); + +- kfd_unbind_processes_from_device(kfd); ++ kfd->dqm->ops.stop(kfd->dqm); + +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); +- amd_iommu_free_device(kfd->pdev); ++ kfd_iommu_suspend(kfd); + } + + int kgd2kfd_resume(struct kfd_dev *kfd) + { ++ int ret, count; ++ + if (!kfd->init_complete) + return 0; + +- return kfd_resume(kfd); ++ ret = kfd_resume(kfd); ++ if (ret) ++ return ret; + ++ count = atomic_dec_return(&kfd_device_suspended); ++ WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); ++ if (count == 0) ++ ret = kfd_resume_all_processes(); ++ ++ return ret; + } + + static int kfd_resume(struct kfd_dev *kfd) + { + int err = 0; +- unsigned int pasid_limit = kfd_get_pasid_limit(); +- +- err = amd_iommu_init_device(kfd->pdev, pasid_limit); +- if (err) +- return -ENXIO; +- amd_iommu_set_invalidate_ctx_cb(kfd->pdev, +- iommu_pasid_shutdown_callback); +- amd_iommu_set_invalid_ppr_cb(kfd->pdev, +- iommu_invalid_ppr_cb); + +- err = kfd_bind_processes_to_device(kfd); +- if (err) +- goto processes_bind_error; ++ err = kfd_iommu_resume(kfd); ++ if (err) { ++ dev_err(kfd_device, ++ "Failed to resume IOMMU for device %x:%x\n", ++ kfd->pdev->vendor, kfd->pdev->device); ++ return err; ++ } + + err = kfd->dqm->ops.start(kfd->dqm); + if (err) { +@@ -413,28 +614,136 @@ static int kfd_resume(struct kfd_dev *kfd) + return err; + + dqm_start_error: +-processes_bind_error: +- amd_iommu_free_device(kfd->pdev); +- ++ kfd_iommu_suspend(kfd); + return err; + } + + /* This is called directly from KGD at ISR. */ + void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) + { ++ uint32_t patched_ihre[DIV_ROUND_UP( ++ kfd->device_info->ih_ring_entry_size, ++ sizeof(uint32_t))]; ++ bool is_patched = false; ++ + if (!kfd->init_complete) + return; + + spin_lock(&kfd->interrupt_lock); + +- if (kfd->interrupts_active +- && interrupt_is_wanted(kfd, ih_ring_entry) +- && enqueue_ih_ring_entry(kfd, ih_ring_entry)) ++ if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, ++ patched_ihre, &is_patched) ++ && enqueue_ih_ring_entry(kfd, ++ is_patched ? 
patched_ihre : ih_ring_entry)) + queue_work(kfd->ih_wq, &kfd->interrupt_work); + + spin_unlock(&kfd->interrupt_lock); + } + ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function increments the process ref count. ++ */ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ if (kfd) { ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = kfd->dqm->ops.evict_process_queues(kfd->dqm, ++ &pdd->qpd); ++ } else { ++ r = kfd_process_evict_queues(p); ++ } ++ ++ kfd_unref_process(p); ++ return r; ++} ++ ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) ++{ ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int r; ++ ++ /* Because we are called from arbitrary context (workqueue) as opposed ++ * to process context, kfd_process could attempt to exit while we are ++ * running so the lookup function increments the process ref count. ++ */ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ if (kfd) { ++ r = -ENODEV; ++ pdd = kfd_get_process_device_data(kfd, p); ++ if (pdd) ++ r = kfd->dqm->ops.restore_process_queues(kfd->dqm, ++ &pdd->qpd); ++ } else { ++ r = kfd_process_restore_queues(p); ++ } ++ ++ kfd_unref_process(p); ++ return r; ++} ++ ++/** kgd2kfd_schedule_evict_and_restore_process - Schedules work queue that will ++ * prepare for safe eviction of KFD BOs that belong to the specified ++ * process. ++ * ++ * @mm: mm_struct that identifies the specified KFD process ++ * @fence: eviction fence attached to KFD process BOs ++ * ++ */ ++int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, ++ struct dma_fence *fence) ++{ ++ struct kfd_process *p; ++ unsigned long active_time; ++ unsigned long delay_jiffies = msecs_to_jiffies(PROCESS_ACTIVE_TIME_MS); ++ ++ if (!fence) ++ return -EINVAL; ++ ++ if (dma_fence_is_signaled(fence)) ++ return 0; ++ ++ p = kfd_lookup_process_by_mm(mm); ++ if (!p) ++ return -ENODEV; ++ ++ if (fence->seqno == p->last_eviction_seqno) ++ goto out; ++ ++ p->last_eviction_seqno = fence->seqno; ++ ++ /* Avoid KFD process starvation. 
Wait for at least ++ * PROCESS_ACTIVE_TIME_MS before evicting the process again ++ */ ++ active_time = get_jiffies_64() - p->last_restore_timestamp; ++ if (delay_jiffies > active_time) ++ delay_jiffies -= active_time; ++ else ++ delay_jiffies = 0; ++ ++ /* During process initialization eviction_work.dwork is initialized ++ * to kfd_evict_bo_worker ++ */ ++ schedule_delayed_work(&p->eviction_work, delay_jiffies); ++out: ++ kfd_unref_process(p); ++ return 0; ++} ++ + static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, + unsigned int chunk_size) + { +@@ -498,8 +807,8 @@ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, + if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size) + return -ENOMEM; + +- *mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); +- if ((*mem_obj) == NULL) ++ *mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); ++ if (!(*mem_obj)) + return -ENOMEM; + + pr_debug("Allocated mem_obj = %p for size = %d\n", *mem_obj, size); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index b21285a..f3b9ba7 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -98,6 +98,17 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm) + return dqm->dev->shared_resources.num_pipe_per_mec; + } + ++static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) ++{ ++ return dqm->dev->device_info->num_sdma_engines; ++} ++ ++unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) ++{ ++ return dqm->dev->device_info->num_sdma_engines ++ * KFD_SDMA_QUEUES_PER_ENGINE; ++} ++ + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { +@@ -109,6 +120,57 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, + qpd->sh_mem_bases); + } + ++static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) ++{ ++ struct kfd_dev *dev = qpd->dqm->dev; ++ ++ if (!KFD_IS_SOC15(dev->device_info->asic_family)) { ++ /* On pre-SOC15 chips we need to use the queue ID to ++ * preserve the user mode ABI. ++ */ ++ q->doorbell_id = q->properties.queue_id; ++ } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { ++ /* For SDMA queues on SOC15, use static doorbell ++ * assignments based on the engine and queue. 
++ */ ++ q->doorbell_id = dev->shared_resources.sdma_doorbell ++ [q->properties.sdma_engine_id] ++ [q->properties.sdma_queue_id]; ++ } else { ++ /* For CP queues on SOC15 reserve a free doorbell ID */ ++ unsigned int found; ++ ++ found = find_first_zero_bit(qpd->doorbell_bitmap, ++ KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); ++ if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { ++ pr_debug("No doorbells available"); ++ return -EBUSY; ++ } ++ set_bit(found, qpd->doorbell_bitmap); ++ q->doorbell_id = found; ++ } ++ ++ q->properties.doorbell_off = ++ kfd_doorbell_id_to_offset(dev, q->process, ++ q->doorbell_id); ++ ++ return 0; ++} ++ ++static void deallocate_doorbell(struct qcm_process_device *qpd, ++ struct queue *q) ++{ ++ unsigned int old; ++ struct kfd_dev *dev = qpd->dqm->dev; ++ ++ if (!KFD_IS_SOC15(dev->device_info->asic_family) || ++ q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ return; ++ ++ old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); ++ WARN_ON(!old); ++} ++ + static int allocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +@@ -118,9 +180,8 @@ static int allocate_vmid(struct device_queue_manager *dqm, + if (dqm->vmid_bitmap == 0) + return -ENOMEM; + +- bit = find_first_bit((unsigned long *)&dqm->vmid_bitmap, +- dqm->dev->vm_info.vmid_num_kfd); +- clear_bit(bit, (unsigned long *)&dqm->vmid_bitmap); ++ bit = ffs(dqm->vmid_bitmap) - 1; ++ dqm->vmid_bitmap &= ~(1 << bit); + + allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; + pr_debug("vmid allocation %d\n", allocated_vmid); +@@ -130,19 +191,50 @@ static int allocate_vmid(struct device_queue_manager *dqm, + set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); + program_sh_mem_settings(dqm, qpd); + ++ /* qpd->page_table_base is set earlier when register_process() ++ * is called, i.e. when the first queue is created. 
++ */ ++ dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, ++ qpd->vmid, ++ qpd->page_table_base); ++ /*invalidate the VM context after pasid and vmid mapping is set up*/ ++ kfd_flush_tlb(qpd_to_pdd(qpd)); ++ + return 0; + } + ++static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, ++ struct qcm_process_device *qpd) ++{ ++ uint32_t len; ++ ++ if (!qpd->ib_kaddr) ++ return -ENOMEM; ++ ++ len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, ++ (uint32_t *)qpd->ib_kaddr); ++ ++ return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, ++ qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); ++} ++ + static void deallocate_vmid(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) + { + int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; + ++ /* On GFX v7, CP doesn't flush TC at dequeue */ ++ if (q->device->device_info->asic_family == CHIP_HAWAII) ++ if (flush_texture_cache_nocpsch(q->device, qpd)) ++ pr_err("Failed to flush TC\n"); ++ ++ kfd_flush_tlb(qpd_to_pdd(qpd)); ++ + /* Release the vmid mapping */ + set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + +- set_bit(bit, (unsigned long *)&dqm->vmid_bitmap); ++ dqm->vmid_bitmap |= (1 << bit); + qpd->vmid = 0; + q->properties.vmid = 0; + } +@@ -170,6 +262,14 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + goto out_unlock; + } + q->properties.vmid = qpd->vmid; ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; +@@ -184,6 +284,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, + if (retval) { + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); ++ + goto out_unlock; + } + +@@ -223,12 +324,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) + continue; + + if (dqm->allocated_queues[pipe] != 0) { +- bit = find_first_bit( +- (unsigned long *)&dqm->allocated_queues[pipe], +- get_queues_per_pipe(dqm)); +- +- clear_bit(bit, +- (unsigned long *)&dqm->allocated_queues[pipe]); ++ bit = ffs(dqm->allocated_queues[pipe]) - 1; ++ dqm->allocated_queues[pipe] &= ~(1 << bit); + q->pipe = pipe; + q->queue = bit; + set = true; +@@ -249,7 +346,7 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q) + static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q) + { +- set_bit(q->queue, (unsigned long *)&dqm->allocated_queues[q->pipe]); ++ dqm->allocated_queues[q->pipe] |= (1 << q->queue); + } + + static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, +@@ -267,15 +364,19 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, + if (retval) + return retval; + ++ retval = allocate_doorbell(qpd, q); ++ if (retval) ++ goto out_deallocate_hqd; ++ + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) +- goto out_deallocate_hqd; ++ goto out_deallocate_doorbell; + + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", + q->pipe, q->queue); + +- dqm->dev->kfd2kgd->set_scratch_backing_va( ++ dqm->dev->kfd2kgd->alloc_memory_of_scratch( + dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); + + if (!q->properties.is_active) +@@ -290,6 +391,8 @@ static int 
create_compute_queue_nocpsch(struct device_queue_manager *dqm, + + out_uninit_mqd: + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++out_deallocate_doorbell: ++ deallocate_doorbell(qpd, q); + out_deallocate_hqd: + deallocate_hqd(dqm, q); + +@@ -323,6 +426,8 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + } + dqm->total_queue_count--; + ++ deallocate_doorbell(qpd, q); ++ + retval = mqd->destroy_mqd(mqd, q->mqd, + KFD_PREEMPT_TYPE_WAVEFRONT_RESET, + KFD_UNMAP_LATENCY_MS, +@@ -371,21 +476,37 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + { + int retval; + struct mqd_manager *mqd; ++ struct kfd_process_device *pdd; ++ + bool prev_active = false; + + mutex_lock(&dqm->lock); ++ ++ pdd = kfd_get_process_device_data(q->device, q->process); ++ if (!pdd) { ++ retval = -ENODEV; ++ goto out_unlock; ++ } + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + if (!mqd) { + retval = -ENOMEM; + goto out_unlock; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (pdd->qpd.evicted > 0) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + /* Save previous activity state for counters */ + prev_active = q->properties.is_active; + + /* Make sure the queue is unmapped before updating the MQD */ +- if (sched_policy != KFD_SCHED_POLICY_NO_HWS) { ++ if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { + retval = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); + if (retval) { +@@ -417,7 +538,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + +- if (sched_policy != KFD_SCHED_POLICY_NO_HWS) ++ if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && + (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || +@@ -451,11 +572,194 @@ static struct mqd_manager *get_mqd_manager( + return mqd; + } + ++static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q; ++ struct mqd_manager *mqd; ++ struct kfd_process_device *pdd; ++ int retval = 0; ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted++ > 0) /* already evicted, do nothing */ ++ goto out; ++ ++ pdd = qpd_to_pdd(qpd); ++ pr_info_ratelimited("Evicting PASID %u queues\n", ++ pdd->process->pasid); ++ ++ /* unactivate all active queues on the qpd */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (!q->properties.is_active) ++ continue; ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ pr_err("Cannot evict queue, mqd mgr is NULL\n"); ++ retval = -ENOMEM; ++ goto out; ++ } ++ q->properties.is_evicted = true; ++ q->properties.is_active = false; ++ retval = mqd->destroy_mqd(mqd, q->mqd, ++ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, ++ KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); ++ if (retval) ++ goto out; ++ dqm->queue_count--; ++ } ++ ++out: ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ ++static int evict_process_queues_cpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q; ++ struct kfd_process_device *pdd; ++ int retval = 0; ++ ++ mutex_lock(&dqm->lock); ++ if (qpd->evicted++ > 0) /* already evicted, do 
nothing */ ++ goto out; ++ ++ pdd = qpd_to_pdd(qpd); ++ pr_info_ratelimited("Evicting PASID %u queues\n", ++ pdd->process->pasid); ++ ++ /* unactivate all active queues on the qpd */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (!q->properties.is_active) ++ continue; ++ q->properties.is_evicted = true; ++ q->properties.is_active = false; ++ dqm->queue_count--; ++ } ++ retval = execute_queues_cpsch(dqm, ++ qpd->is_debug ? ++ KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); ++ ++out: ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ ++static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q; ++ struct mqd_manager *mqd; ++ struct kfd_process_device *pdd; ++ uint32_t pd_base; ++ int retval = 0; ++ ++ pdd = qpd_to_pdd(qpd); ++ /* Retrieve PD base */ ++ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ ++ mutex_lock(&dqm->lock); ++ if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ ++ goto out; ++ if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ ++ qpd->evicted--; ++ goto out; ++ } ++ ++ pr_info_ratelimited("Restoring PASID %u queues\n", ++ pdd->process->pasid); ++ ++ /* Update PD Base in QPD */ ++ qpd->page_table_base = pd_base; ++ pr_debug("Updated PD address to 0x%08x\n", pd_base); ++ ++ if (!list_empty(&qpd->queues_list)) { ++ dqm->dev->kfd2kgd->set_vm_context_page_table_base( ++ dqm->dev->kgd, ++ qpd->vmid, ++ qpd->page_table_base); ++ kfd_flush_tlb(pdd); ++ } ++ ++ /* activate all active queues on the qpd */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (!q->properties.is_evicted) ++ continue; ++ mqd = dqm->ops.get_mqd_manager(dqm, ++ get_mqd_type_from_queue_type(q->properties.type)); ++ if (!mqd) { /* should not be here */ ++ pr_err("Cannot restore queue, mqd mgr is NULL\n"); ++ retval = -ENOMEM; ++ goto out; ++ } ++ q->properties.is_evicted = false; ++ q->properties.is_active = true; ++ retval = mqd->load_mqd(mqd, q->mqd, q->pipe, ++ q->queue, &q->properties, ++ q->process->mm); ++ if (retval) ++ goto out; ++ dqm->queue_count++; ++ } ++ qpd->evicted = 0; ++out: ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ ++static int restore_process_queues_cpsch(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct queue *q; ++ struct kfd_process_device *pdd; ++ uint32_t pd_base; ++ int retval = 0; ++ ++ pdd = qpd_to_pdd(qpd); ++ /* Retrieve PD base */ ++ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ ++ mutex_lock(&dqm->lock); ++ if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ ++ goto out; ++ if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */ ++ qpd->evicted--; ++ goto out; ++ } ++ ++ pr_info_ratelimited("Restoring PASID %u queues\n", ++ pdd->process->pasid); ++ ++ /* Update PD Base in QPD */ ++ qpd->page_table_base = pd_base; ++ pr_debug("Updated PD address to 0x%08x\n", pd_base); ++ ++ /* activate all active queues on the qpd */ ++ list_for_each_entry(q, &qpd->queues_list, list) { ++ if (!q->properties.is_evicted) ++ continue; ++ q->properties.is_evicted = false; ++ q->properties.is_active = true; ++ dqm->queue_count++; ++ } ++ retval = execute_queues_cpsch(dqm, ++ KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); ++ if (!retval) ++ qpd->evicted = 0; ++out: ++ mutex_unlock(&dqm->lock); ++ return retval; ++} ++ + static int register_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { + struct 
device_process_node *n; + int retval; ++ struct kfd_process_device *pdd; ++ uint32_t pd_base; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) +@@ -463,9 +767,17 @@ static int register_process(struct device_queue_manager *dqm, + + n->qpd = qpd; + ++ pdd = qpd_to_pdd(qpd); ++ /* Retrieve PD base */ ++ pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm); ++ + mutex_lock(&dqm->lock); + list_add(&n->list, &dqm->queues); + ++ /* Update PD Base in QPD */ ++ qpd->page_table_base = pd_base; ++ pr_debug("Updated PD address to 0x%08x\n", pd_base); ++ + retval = dqm->asic_ops.update_qpd(dqm, qpd); + + dqm->processes_count++; +@@ -552,7 +864,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) + } + + dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; +- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; ++ dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + return 0; + } +@@ -573,11 +885,12 @@ static void uninitialize(struct device_queue_manager *dqm) + static int start_nocpsch(struct device_queue_manager *dqm) + { + init_interrupts(dqm); +- return 0; ++ return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); + } + + static int stop_nocpsch(struct device_queue_manager *dqm) + { ++ pm_uninit(&dqm->packets); + return 0; + } + +@@ -589,10 +902,8 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + +- bit = find_first_bit((unsigned long *)&dqm->sdma_bitmap, +- CIK_SDMA_QUEUES); +- +- clear_bit(bit, (unsigned long *)&dqm->sdma_bitmap); ++ bit = ffs(dqm->sdma_bitmap) - 1; ++ dqm->sdma_bitmap &= ~(1 << bit); + *sdma_queue_id = bit; + + return 0; +@@ -601,9 +912,9 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, + static void deallocate_sdma_queue(struct device_queue_manager *dqm, + unsigned int sdma_queue_id) + { +- if (sdma_queue_id >= CIK_SDMA_QUEUES) ++ if (sdma_queue_id >= get_num_sdma_queues(dqm)) + return; +- set_bit(sdma_queue_id, (unsigned long *)&dqm->sdma_bitmap); ++ dqm->sdma_bitmap |= (1 << sdma_queue_id); + } + + static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, +@@ -621,8 +932,12 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + if (retval) + return retval; + +- q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; +- q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); ++ q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); ++ ++ retval = allocate_doorbell(qpd, q); ++ if (retval) ++ goto out_deallocate_sdma_queue; + + pr_debug("SDMA id is: %d\n", q->sdma_id); + pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); +@@ -632,7 +947,7 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) +- goto out_deallocate_sdma_queue; ++ goto out_deallocate_doorbell; + + retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); + if (retval) +@@ -642,6 +957,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, + + out_uninit_mqd: + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); ++out_deallocate_doorbell: ++ deallocate_doorbell(qpd, q); + out_deallocate_sdma_queue: + deallocate_sdma_queue(dqm, q->sdma_id); + +@@ -702,7 +1019,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->processes_count = 0; + 
dqm->sdma_queue_count = 0; + dqm->active_runlist = false; +- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1; ++ dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + + return 0; + } +@@ -713,7 +1030,7 @@ static int start_cpsch(struct device_queue_manager *dqm) + + retval = 0; + +- retval = pm_init(&dqm->packets, dqm); ++ retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); + if (retval) + goto fail_packet_manager_init; + +@@ -750,7 +1067,9 @@ static int start_cpsch(struct device_queue_manager *dqm) + static int stop_cpsch(struct device_queue_manager *dqm) + { + mutex_lock(&dqm->lock); ++ + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); ++ + mutex_unlock(&dqm->lock); + + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); +@@ -821,25 +1140,38 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + pr_warn("Can't create new usermode queue because %d queues were already created\n", + dqm->total_queue_count); + retval = -EPERM; +- goto out; ++ goto out_unlock; + } + + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (retval) +- goto out; ++ goto out_unlock; + q->properties.sdma_queue_id = +- q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE; ++ q->sdma_id / get_num_sdma_engines(dqm); + q->properties.sdma_engine_id = +- q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE; ++ q->sdma_id % get_num_sdma_engines(dqm); + } ++ ++ retval = allocate_doorbell(qpd, q); ++ if (retval) ++ goto out_deallocate_sdma_queue; ++ + mqd = dqm->ops.get_mqd_manager(dqm, + get_mqd_type_from_queue_type(q->properties.type)); + + if (!mqd) { + retval = -ENOMEM; +- goto out; ++ goto out_deallocate_doorbell; + } ++ /* ++ * Eviction state logic: we only mark active queues as evicted ++ * to avoid the overhead of restoring inactive queues later ++ */ ++ if (qpd->evicted) ++ q->properties.is_evicted = (q->properties.queue_size > 0 && ++ q->properties.queue_percent > 0 && ++ q->properties.queue_address != 0); + + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + +@@ -848,7 +1180,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (retval) +- goto out; ++ goto out_deallocate_doorbell; + + list_add(&q->list, &qpd->queues_list); + qpd->queue_count++; +@@ -869,9 +1201,18 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, + pr_debug("Total of %d queues are accountable so far\n", + dqm->total_queue_count); + +-out: + mutex_unlock(&dqm->lock); + return retval; ++ ++out_deallocate_doorbell: ++ deallocate_doorbell(qpd, q); ++out_deallocate_sdma_queue: ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ deallocate_sdma_queue(dqm, q->sdma_id); ++out_unlock: ++ mutex_unlock(&dqm->lock); ++ ++ return retval; + } + + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, +@@ -1006,6 +1347,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + goto failed; + } + ++ deallocate_doorbell(qpd, q); ++ + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; + deallocate_sdma_queue(dqm, q->sdma_id); +@@ -1057,7 +1400,10 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm, + void __user *alternate_aperture_base, + uint64_t alternate_aperture_size) + { +- bool retval; ++ bool retval = true; ++ ++ if (!dqm->asic_ops.set_cache_memory_policy) ++ return retval; + + mutex_lock(&dqm->lock); + +@@ -1097,7 +1443,7 @@ static bool 
set_cache_memory_policy(struct device_queue_manager *dqm, + alternate_aperture_base, + alternate_aperture_size); + +- if ((sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) ++ if ((dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) && (qpd->vmid != 0)) + program_sh_mem_settings(dqm, qpd); + + pr_debug("sh_mem_config: 0x%x, ape1_base: 0x%x, ape1_limit: 0x%x\n", +@@ -1161,6 +1507,41 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, + return retval; + } + ++static int get_wave_state(struct device_queue_manager *dqm, ++ struct queue *q, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size) ++{ ++ struct mqd_manager *mqd; ++ int r; ++ ++ mutex_lock(&dqm->lock); ++ ++ if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE || ++ q->properties.is_active || !q->device->cwsr_enabled) { ++ r = -EINVAL; ++ goto dqm_unlock; ++ } ++ ++ mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); ++ if (!mqd) { ++ r = -ENOMEM; ++ goto dqm_unlock; ++ } ++ ++ if (!mqd->get_wave_state) { ++ r = -EINVAL; ++ goto dqm_unlock; ++ } ++ ++ r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, ++ save_area_used_size); ++ ++dqm_unlock: ++ mutex_unlock(&dqm->lock); ++ return r; ++} + + static int process_termination_cpsch(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +@@ -1188,8 +1569,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, + + /* Clear all user mode queues */ + list_for_each_entry(q, &qpd->queues_list, list) { +- if (q->properties.type == KFD_QUEUE_TYPE_SDMA) ++ if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + dqm->sdma_queue_count--; ++ deallocate_sdma_queue(dqm, q->sdma_id); ++ } + + if (q->properties.is_active) + dqm->queue_count--; +@@ -1242,8 +1625,18 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + if (!dqm) + return NULL; + ++ switch (dev->device_info->asic_family) { ++ case CHIP_HAWAII: ++ case CHIP_TONGA: ++ dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; ++ break; ++ default: ++ dqm->sched_policy = sched_policy; ++ break; ++ } ++ + dqm->dev = dev; +- switch (sched_policy) { ++ switch (dqm->sched_policy) { + case KFD_SCHED_POLICY_HWS: + case KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION: + /* initialize dqm for cp scheduling */ +@@ -1262,6 +1655,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_cpsch; ++ dqm->ops.evict_process_queues = evict_process_queues_cpsch; ++ dqm->ops.restore_process_queues = restore_process_queues_cpsch; ++ dqm->ops.get_wave_state = get_wave_state; + break; + case KFD_SCHED_POLICY_NO_HWS: + /* initialize dqm for no cp scheduling */ +@@ -1278,9 +1674,13 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) + dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; + dqm->ops.process_termination = process_termination_nocpsch; ++ dqm->ops.evict_process_queues = evict_process_queues_nocpsch; ++ dqm->ops.restore_process_queues = ++ restore_process_queues_nocpsch; ++ dqm->ops.get_wave_state = get_wave_state; + break; + default: +- pr_err("Invalid scheduling policy %d\n", sched_policy); ++ pr_err("Invalid scheduling policy %d\n", dqm->sched_policy); + goto out_free; + } + +@@ -1292,6 +1692,22 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) 
+ case CHIP_KAVERI: + device_queue_manager_init_cik(&dqm->asic_ops); + break; ++ ++ case CHIP_HAWAII: ++ device_queue_manager_init_cik_hawaii(&dqm->asic_ops); ++ break; ++ ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: ++ device_queue_manager_init_vi_tonga(&dqm->asic_ops); ++ break; ++ ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ device_queue_manager_init_v9_vega10(&dqm->asic_ops); ++ break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); +@@ -1312,6 +1728,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) + kfree(dqm); + } + ++int kfd_process_vm_fault(struct device_queue_manager *dqm, ++ unsigned int pasid) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ int ret = 0; ++ ++ if (!p) ++ return -EINVAL; ++ pdd = kfd_get_process_device_data(dqm->dev, p); ++ if (pdd) ++ ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); ++ kfd_unref_process(p); ++ ++ return ret; ++} ++ + #if defined(CONFIG_DEBUG_FS) + + static void seq_reg_dump(struct seq_file *m, +@@ -1363,8 +1796,8 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) + } + } + +- for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { +- for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { ++ for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { ++ for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) { + r = dqm->dev->kfd2kgd->hqd_sdma_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index c61b693..978458a 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -31,10 +31,7 @@ + + #define KFD_UNMAP_LATENCY_MS (4000) + #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) +- +-#define CIK_SDMA_QUEUES (4) +-#define CIK_SDMA_QUEUES_PER_ENGINE (2) +-#define CIK_SDMA_ENGINE_NUM (2) ++#define KFD_SDMA_QUEUES_PER_ENGINE (2) + + struct device_process_node { + struct qcm_process_device *qpd; +@@ -79,6 +76,12 @@ struct device_process_node { + * + * @process_termination: Clears all process queues belongs to that device. + * ++ * @evict_process_queues: Evict all active queues of a process ++ * ++ * @restore_process_queues: Restore all evicted queues queues of a process ++ * ++ * @get_wave_state: Retrieves context save state and optionally copies the ++ * control stack, if kept in the MQD, to the given userspace address. 
+ */ + + struct device_queue_manager_ops { +@@ -129,6 +132,17 @@ struct device_queue_manager_ops { + + int (*process_termination)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); ++ ++ int (*evict_process_queues)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ int (*restore_process_queues)(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ ++ int (*get_wave_state)(struct device_queue_manager *dqm, ++ struct queue *q, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size); + }; + + struct device_queue_manager_asic_ops { +@@ -180,23 +194,38 @@ struct device_queue_manager { + unsigned int *fence_addr; + struct kfd_mem_obj *fence_mem; + bool active_runlist; ++ int sched_policy; + }; + + void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops); ++void device_queue_manager_init_cik_hawaii( ++ struct device_queue_manager_asic_ops *asic_ops); + void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops); ++void device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *asic_ops); ++void device_queue_manager_init_v9_vega10( ++ struct device_queue_manager_asic_ops *asic_ops); + void program_sh_mem_settings(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + unsigned int get_queues_num(struct device_queue_manager *dqm); + unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); + unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); ++unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); ++ ++int process_evict_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++int process_restore_queues(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++ + + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) + { + return (pdd->lds_base >> 16) & 0xFF; + } + ++/* This function is only useful for GFXv7 and v8 */ + static inline unsigned int + get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) + { +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +index 28e48c9..aed4c21 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +@@ -34,8 +34,13 @@ static bool set_cache_memory_policy_cik(struct device_queue_manager *dqm, + uint64_t alternate_aperture_size); + static int update_qpd_cik(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); ++static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); ++static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd); + + void device_queue_manager_init_cik( + struct device_queue_manager_asic_ops *asic_ops) +@@ -45,6 +50,14 @@ void device_queue_manager_init_cik( + asic_ops->init_sdma_vm = init_sdma_vm; + } + ++void device_queue_manager_init_cik_hawaii( ++ struct device_queue_manager_asic_ops *asic_ops) ++{ ++ asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; ++ asic_ops->update_qpd = update_qpd_cik_hawaii; ++ asic_ops->init_sdma_vm = init_sdma_vm_hawaii; ++} ++ + static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) + { + /* In 64-bit mode, we can only 
control the top 3 bits of the LDS, +@@ -132,6 +145,36 @@ static int update_qpd_cik(struct device_queue_manager *dqm, + return 0; + } + ++static int update_qpd_cik_hawaii(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ unsigned int temp; ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ ALIGNMENT_MODE(SH_MEM_ALIGNMENT_MODE_UNALIGNED) | ++ DEFAULT_MTYPE(MTYPE_NONCACHED) | ++ APE1_MTYPE(MTYPE_NONCACHED); ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. ++ */ ++ temp = get_sh_mem_bases_nybble_64(pdd); ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ ++ pr_debug("is32bit process: %d sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ qpd->pqm->process->is_32bit_user_mode, temp, qpd->sh_mem_bases); ++ ++ return 0; ++} ++ + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) + { +@@ -147,3 +190,16 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + + q->properties.sdma_vm_addr = value; + } ++ ++static void init_sdma_vm_hawaii(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. ++ */ ++ q->properties.sdma_vm_addr = ++ ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +new file mode 100644 +index 0000000..6198bf2 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +@@ -0,0 +1,84 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ * ++ */ ++ ++#include "kfd_device_queue_manager.h" ++#include "vega10_enum.h" ++#include "gc/gc_9_0_offset.h" ++#include "gc/gc_9_0_sh_mask.h" ++#include "sdma0/sdma0_4_0_sh_mask.h" ++ ++static int update_qpd_v9(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, ++ struct qcm_process_device *qpd); ++ ++void device_queue_manager_init_v9_vega10( ++ struct device_queue_manager_asic_ops *asic_ops) ++{ ++ asic_ops->update_qpd = update_qpd_v9; ++ asic_ops->init_sdma_vm = init_sdma_vm_v9; ++} ++ ++static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) ++{ ++ uint32_t shared_base = pdd->lds_base >> 48; ++ uint32_t private_base = pdd->scratch_base >> 48; ++ ++ return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | ++ private_base; ++} ++ ++static int update_qpd_v9(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; ++ if (vega10_noretry && ++ !dqm->dev->device_info->needs_iommu_device) ++ qpd->sh_mem_config |= ++ 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; ++ ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); ++ ++ pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); ++ ++ return 0; ++} ++ ++static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ /* Not needed on SDMAv4 any more */ ++ q->properties.sdma_vm_addr = 0; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +index 2fbce57..030b014 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +@@ -38,6 +38,30 @@ static int update_qpd_vi(struct device_queue_manager *dqm, + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + ++/* ++ * Tonga device queue manager functions ++ */ ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t alternate_aperture_size); ++static int update_qpd_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd); ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd); ++ ++void device_queue_manager_init_vi_tonga( ++ struct device_queue_manager_asic_ops *asic_ops) ++{ ++ asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; ++ asic_ops->update_qpd = update_qpd_vi_tonga; ++ asic_ops->init_sdma_vm = init_sdma_vm_tonga; ++} ++ ++ + void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) + { +@@ -103,6 +127,33 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, + return true; + } + ++static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd, ++ enum cache_policy default_policy, ++ enum cache_policy alternate_policy, ++ void __user *alternate_aperture_base, ++ uint64_t 
alternate_aperture_size) ++{ ++ uint32_t default_mtype; ++ uint32_t ape1_mtype; ++ ++ default_mtype = (default_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC; ++ ++ ape1_mtype = (alternate_policy == cache_policy_coherent) ? ++ MTYPE_UC : ++ MTYPE_NC; ++ ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ default_mtype << SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ ape1_mtype << SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ return true; ++} ++ + static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) + { +@@ -144,6 +195,40 @@ static int update_qpd_vi(struct device_queue_manager *dqm, + return 0; + } + ++static int update_qpd_vi_tonga(struct device_queue_manager *dqm, ++ struct qcm_process_device *qpd) ++{ ++ struct kfd_process_device *pdd; ++ unsigned int temp; ++ ++ pdd = qpd_to_pdd(qpd); ++ ++ /* check if sh_mem_config register already configured */ ++ if (qpd->sh_mem_config == 0) { ++ qpd->sh_mem_config = ++ SH_MEM_ALIGNMENT_MODE_UNALIGNED << ++ SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__DEFAULT_MTYPE__SHIFT | ++ MTYPE_UC << ++ SH_MEM_CONFIG__APE1_MTYPE__SHIFT; ++ ++ qpd->sh_mem_ape1_limit = 0; ++ qpd->sh_mem_ape1_base = 0; ++ } ++ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. ++ */ ++ temp = get_sh_mem_bases_nybble_64(pdd); ++ qpd->sh_mem_bases = compute_sh_mem_bases_64bit(temp); ++ ++ pr_debug("sh_mem_bases nybble: 0x%X and register 0x%X\n", ++ temp, qpd->sh_mem_bases); ++ ++ return 0; ++} ++ + static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) + { +@@ -159,3 +244,16 @@ static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + + q->properties.sdma_vm_addr = value; + } ++ ++static void init_sdma_vm_tonga(struct device_queue_manager *dqm, ++ struct queue *q, ++ struct qcm_process_device *qpd) ++{ ++ /* On dGPU we're always in GPUVM64 addressing mode with 64-bit ++ * aperture addresses. ++ */ ++ q->properties.sdma_vm_addr = ++ ((get_sh_mem_bases_nybble_64(qpd_to_pdd(qpd))) << ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE__SHIFT) & ++ SDMA0_RLC0_VIRTUAL_ADDR__SHARED_BASE_MASK; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +index ebb4da14..fc41689 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +@@ -33,7 +33,6 @@ + + static DEFINE_IDA(doorbell_ida); + static unsigned int max_doorbell_slices; +-#define KFD_SIZE_OF_DOORBELL_IN_BYTES 4 + + /* + * Each device exposes a doorbell aperture, a PCI MMIO aperture that +@@ -50,9 +49,9 @@ static unsigned int max_doorbell_slices; + */ + + /* # of doorbell bytes allocated for each process. 
*/ +-static inline size_t doorbell_process_allocation(void) ++size_t kfd_doorbell_process_slice(struct kfd_dev *kfd) + { +- return roundup(KFD_SIZE_OF_DOORBELL_IN_BYTES * ++ return roundup(kfd->device_info->doorbell_size * + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + PAGE_SIZE); + } +@@ -72,16 +71,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd) + + doorbell_start_offset = + roundup(kfd->shared_resources.doorbell_start_offset, +- doorbell_process_allocation()); ++ kfd_doorbell_process_slice(kfd)); + + doorbell_aperture_size = + rounddown(kfd->shared_resources.doorbell_aperture_size, +- doorbell_process_allocation()); ++ kfd_doorbell_process_slice(kfd)); + + if (doorbell_aperture_size > doorbell_start_offset) + doorbell_process_limit = + (doorbell_aperture_size - doorbell_start_offset) / +- doorbell_process_allocation(); ++ kfd_doorbell_process_slice(kfd); + else + return -ENOSPC; + +@@ -95,7 +94,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) + kfd->doorbell_id_offset = doorbell_start_offset / sizeof(u32); + + kfd->doorbell_kernel_ptr = ioremap(kfd->doorbell_base, +- doorbell_process_allocation()); ++ kfd_doorbell_process_slice(kfd)); + + if (!kfd->doorbell_kernel_ptr) + return -ENOMEM; +@@ -116,7 +115,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) + pr_debug("doorbell aperture size == 0x%08lX\n", + kfd->shared_resources.doorbell_aperture_size); + +- pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); ++ pr_debug("doorbell kernel address == 0x%p\n", kfd->doorbell_kernel_ptr); + + return 0; + } +@@ -127,21 +126,16 @@ void kfd_doorbell_fini(struct kfd_dev *kfd) + iounmap(kfd->doorbell_kernel_ptr); + } + +-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) ++int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, ++ struct vm_area_struct *vma) + { + phys_addr_t address; +- struct kfd_dev *dev; + + /* + * For simplicitly we only allow mapping of the entire doorbell + * allocation of a single device & process. + */ +- if (vma->vm_end - vma->vm_start != doorbell_process_allocation()) +- return -EINVAL; +- +- /* Find kfd device according to gpu id */ +- dev = kfd_device_by_id(vma->vm_pgoff); +- if (!dev) ++ if (vma->vm_end - vma->vm_start != kfd_doorbell_process_slice(dev)) + return -EINVAL; + + /* Calculate physical address of doorbell */ +@@ -158,19 +152,19 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma) + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + (unsigned long long) vma->vm_start, address, vma->vm_flags, +- doorbell_process_allocation()); ++ kfd_doorbell_process_slice(dev)); + + + return io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, +- doorbell_process_allocation(), ++ kfd_doorbell_process_slice(dev), + vma->vm_page_prot); + } + + + /* get kernel iomem pointer for a doorbell */ +-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off) + { + u32 inx; +@@ -185,6 +179,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) + return NULL; + ++ inx *= kfd->device_info->doorbell_size / sizeof(u32); ++ + /* + * Calculating the kernel doorbell offset using the first + * doorbell page. 
+@@ -193,7 +189,7 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + + pr_debug("Get kernel queue doorbell\n" + " doorbell offset == 0x%08X\n" +- " kernel address == %p\n", ++ " kernel address == 0x%p\n", + *doorbell_off, (kfd->doorbell_kernel_ptr + inx)); + + return kfd->doorbell_kernel_ptr + inx; +@@ -210,11 +206,21 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) + mutex_unlock(&kfd->doorbell_mutex); + } + +-inline void write_kernel_doorbell(u32 __iomem *db, u32 value) ++void write_kernel_doorbell(void __iomem *db, u32 value) + { + if (db) { + writel(value, db); +- pr_debug("Writing %d to doorbell address %p\n", value, db); ++ pr_debug("Writing %d to doorbell address 0x%p\n", value, db); ++ } ++} ++ ++void write_kernel_doorbell64(void __iomem *db, u64 value) ++{ ++ if (db) { ++ WARN(((unsigned long)db & 7) != 0, ++ "Unaligned 64-bit doorbell"); ++ writeq(value, (u64 __iomem *)db); ++ pr_debug("writing %llu to doorbell address 0x%p\n", value, db); + } + } + +@@ -222,26 +228,26 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value) + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ +-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, ++unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, + struct kfd_process *process, +- unsigned int queue_id) ++ unsigned int doorbell_id) + { + /* + * doorbell_id_offset accounts for doorbells taken by KGD. +- * index * doorbell_process_allocation/sizeof(u32) adjusts to +- * the process's doorbells. ++ * index * kfd_doorbell_process_slice/sizeof(u32) adjusts to ++ * the process's doorbells. The offset returned is in dword ++ * units regardless of the ASIC-dependent doorbell size. 
+ */ + return kfd->doorbell_id_offset + +- process->doorbell_index +- * doorbell_process_allocation() / sizeof(u32) + +- queue_id; ++ process->doorbell_index * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + ++ doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); + } + + uint64_t kfd_get_number_elems(struct kfd_dev *kfd) + { + uint64_t num_of_elems = (kfd->shared_resources.doorbell_aperture_size - + kfd->shared_resources.doorbell_start_offset) / +- doorbell_process_allocation() + 1; ++ kfd_doorbell_process_slice(kfd) + 1; + + return num_of_elems; + +@@ -251,7 +257,7 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process) + { + return dev->doorbell_base + +- process->doorbell_index * doorbell_process_allocation(); ++ process->doorbell_index * kfd_doorbell_process_slice(dev); + } + + int kfd_alloc_process_doorbells(struct kfd_process *process) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +index 93aae5c..a92ca78 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c +@@ -30,6 +30,7 @@ + #include <linux/memory.h> + #include "kfd_priv.h" + #include "kfd_events.h" ++#include "kfd_iommu.h" + #include <linux/device.h> + + /* +@@ -50,6 +51,7 @@ struct kfd_event_waiter { + */ + struct kfd_signal_page { + uint64_t *kernel_address; ++ uint64_t handle; + uint64_t __user *user_address; + }; + +@@ -97,17 +99,9 @@ static int allocate_event_notification_slot(struct kfd_process *p, + p->signal_page = allocate_signal_page(p); + if (!p->signal_page) + return -ENOMEM; +- /* Oldest user mode expects 256 event slots */ +- p->signal_mapped_size = 256*8; + } + +- /* +- * Compatibility with old user mode: Only use signal slots +- * user mode has mapped, may be less than +- * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase +- * of the event limit without breaking user mode. +- */ +- id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, ++ id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT, + GFP_KERNEL); + if (id < 0) + return id; +@@ -118,6 +112,29 @@ static int allocate_event_notification_slot(struct kfd_process *p, + return 0; + } + ++static struct kfd_signal_page *allocate_signal_page_dgpu( ++ struct kfd_process *p, uint64_t *kernel_address, uint64_t handle) ++{ ++ struct kfd_signal_page *my_page; ++ ++ my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); ++ if (!my_page) ++ return NULL; ++ ++ /* Initialize all events to unsignaled */ ++ memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, ++ KFD_SIGNAL_EVENT_LIMIT * 8); ++ ++ my_page->kernel_address = kernel_address; ++ my_page->handle = handle; ++ my_page->user_address = NULL; ++ ++ pr_debug("Allocated new event signal page at %p, for process %p\n", ++ my_page, p); ++ ++ return my_page; ++} ++ + /* + * Assumes that p->event_mutex is held and of course that p is not going + * away (current or locked). 
+@@ -181,8 +198,7 @@ static int create_signal_event(struct file *devkfd, + { + int ret; + +- if (p->signal_mapped_size && +- p->signal_event_count == p->signal_mapped_size / 8) { ++ if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { + if (!p->signal_event_limit_reached) { + pr_warn("Signal event wasn't created because limit was reached\n"); + p->signal_event_limit_reached = true; +@@ -268,8 +284,9 @@ static void shutdown_signal_page(struct kfd_process *p) + struct kfd_signal_page *page = p->signal_page; + + if (page) { +- free_pages((unsigned long)page->kernel_address, +- get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); ++ if (page->user_address) ++ free_pages((unsigned long)page->kernel_address, ++ get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + kfree(page); + } + } +@@ -294,7 +311,8 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index) ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr) + { + int ret = 0; + struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); +@@ -308,16 +326,25 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, + + init_waitqueue_head(&ev->wq); + +- *event_page_offset = 0; +- + mutex_lock(&p->event_mutex); + ++ if (kern_addr && !p->signal_page) { ++ p->signal_page = allocate_signal_page_dgpu(p, kern_addr, ++ *event_page_offset); ++ if (!p->signal_page) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ *event_page_offset = 0; ++ + switch (event_type) { + case KFD_EVENT_TYPE_SIGNAL: + case KFD_EVENT_TYPE_DEBUG: + ret = create_signal_event(devkfd, p, ev); + if (!ret) { +- *event_page_offset = KFD_MMAP_EVENTS_MASK; ++ *event_page_offset = KFD_MMAP_TYPE_EVENTS; + *event_page_offset <<= PAGE_SHIFT; + *event_slot_index = ev->event_id; + } +@@ -334,6 +361,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, + kfree(ev); + } + ++out: + mutex_unlock(&p->event_mutex); + + return ret; +@@ -362,11 +390,7 @@ static void set_event(struct kfd_event *ev) + { + struct kfd_event_waiter *waiter; + +- /* Auto reset if the list is non-empty and we're waking +- * someone. waitqueue_active is safe here because we're +- * protected by the p->event_mutex, which is also held when +- * updating the wait queues in kfd_wait_on_events. +- */ ++ /* Auto reset if the list is non-empty and we're waking someone. 
*/ + ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); + + list_for_each_entry(waiter, &ev->wq.head, wait.entry) +@@ -468,7 +492,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, + pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n", + partial_id, valid_id_bits); + +- if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/2) { ++ if (p->signal_event_count < KFD_SIGNAL_EVENT_LIMIT/64) { + /* With relatively few events, it's faster to + * iterate over the event IDR + */ +@@ -753,12 +777,12 @@ int kfd_wait_on_events(struct kfd_process *p, + + int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + { ++ + unsigned long pfn; + struct kfd_signal_page *page; +- int ret; + +- /* check required size doesn't exceed the allocated size */ +- if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < ++ /* check required size is logical */ ++ if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != + get_order(vma->vm_end - vma->vm_start)) { + pr_err("Event page mmap requested illegal size\n"); + return -EINVAL; +@@ -788,12 +812,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) + page->user_address = (uint64_t __user *)vma->vm_start; + + /* mapping the page to user process */ +- ret = remap_pfn_range(vma, vma->vm_start, pfn, ++ return remap_pfn_range(vma, vma->vm_start, pfn, + vma->vm_end - vma->vm_start, vma->vm_page_prot); +- if (!ret) +- p->signal_mapped_size = vma->vm_end - vma->vm_start; +- +- return ret; + } + + /* +@@ -822,6 +842,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, + ev->memory_exception_data = *ev_data; + } + ++ if (type == KFD_EVENT_TYPE_MEMORY) { ++ dev_warn(kfd_device, ++ "Sending SIGSEGV to HSA Process with PID %d ", ++ p->lead_thread->pid); ++ send_sig(SIGSEGV, p->lead_thread, 0); ++ } ++ + /* Send SIGTERM no event of type "type" has been found*/ + if (send_signal) { + if (send_sigterm) { +@@ -837,6 +864,7 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, + } + } + ++#ifdef KFD_SUPPORT_IOMMU_V2 + void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + unsigned long address, bool is_write_requested, + bool is_execute_requested) +@@ -896,15 +924,28 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, + up_read(&mm->mmap_sem); + mmput(mm); + +- mutex_lock(&p->event_mutex); ++ pr_debug("notpresent %d, noexecute %d, readonly %d\n", ++ memory_exception_data.failure.NotPresent, ++ memory_exception_data.failure.NoExecute, ++ memory_exception_data.failure.ReadOnly); + +- /* Lookup events by type and signal them */ +- lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY, +- &memory_exception_data); ++ /* Workaround on Raven to not kill the process when memory is freed ++ * before IOMMU is able to finish processing all the excessive PPRs ++ * triggered due to HW flaws. 
++ */ ++ if (dev->device_info->asic_family != CHIP_RAVEN) { ++ mutex_lock(&p->event_mutex); ++ ++ /* Lookup events by type and signal them */ ++ lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY, ++ &memory_exception_data); ++ ++ mutex_unlock(&p->event_mutex); ++ } + +- mutex_unlock(&p->event_mutex); + kfd_unref_process(p); + } ++#endif /* KFD_SUPPORT_IOMMU_V2 */ + + void kfd_signal_hw_exception_event(unsigned int pasid) + { +@@ -926,3 +967,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid) + mutex_unlock(&p->event_mutex); + kfd_unref_process(p); + } ++ ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info) ++{ ++ struct kfd_event *ev; ++ uint32_t id; ++ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); ++ struct kfd_hsa_memory_exception_data memory_exception_data; ++ ++ if (!p) ++ return; /* Presumably process exited. */ ++ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); ++ memory_exception_data.gpu_id = dev->id; ++ memory_exception_data.failure.imprecise = true; ++ /* Set failure reason */ ++ if (info) { ++ memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; ++ memory_exception_data.failure.NotPresent = ++ info->prot_valid ? 1 : 0; ++ memory_exception_data.failure.NoExecute = ++ info->prot_exec ? 1 : 0; ++ memory_exception_data.failure.ReadOnly = ++ info->prot_write ? 1 : 0; ++ memory_exception_data.failure.imprecise = 0; ++ } ++ mutex_lock(&p->event_mutex); ++ ++ id = KFD_FIRST_NONSIGNAL_EVENT_ID; ++ idr_for_each_entry_continue(&p->event_idr, ev, id) ++ if (ev->type == KFD_EVENT_TYPE_MEMORY) { ++ ev->memory_exception_data = memory_exception_data; ++ set_event(ev); ++ } ++ ++ mutex_unlock(&p->event_mutex); ++ kfd_unref_process(p); ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +index 7377513..2c00711 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +@@ -275,24 +275,80 @@ + * for FLAT_* / S_LOAD operations. + */ + +-#define MAKE_GPUVM_APP_BASE(gpu_num) \ ++#define MAKE_GPUVM_APP_BASE_VI(gpu_num) \ + (((uint64_t)(gpu_num) << 61) + 0x1000000000000L) + +-#define MAKE_GPUVM_APP_LIMIT(base) \ +- (((uint64_t)(base) & \ +- 0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL) ++#define MAKE_GPUVM_APP_LIMIT(base, size) \ ++ (((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1) + +-#define MAKE_SCRATCH_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x100000000L) ++#define MAKE_SCRATCH_APP_BASE_VI() \ ++ (((uint64_t)(0x1UL) << 61) + 0x100000000L) + + #define MAKE_SCRATCH_APP_LIMIT(base) \ + (((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + +-#define MAKE_LDS_APP_BASE(gpu_num) \ +- (((uint64_t)(gpu_num) << 61) + 0x0) ++#define MAKE_LDS_APP_BASE_VI() \ ++ (((uint64_t)(0x1UL) << 61) + 0x0) ++ + #define MAKE_LDS_APP_LIMIT(base) \ + (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) + ++/* On GFXv9 the LDS and scratch apertures are programmed independently ++ * using the high 16 bits of the 64-bit virtual address. They must be ++ * in the hole, which will be the case as long as the high 16 bits are ++ * not 0. ++ * ++ * The aperture sizes are still 4GB implicitly. ++ * ++ * A GPUVM aperture is not applicable on GFXv9. ++ */ ++#define MAKE_LDS_APP_BASE_V9() ((uint64_t)(0x1UL) << 48) ++#define MAKE_SCRATCH_APP_BASE_V9() ((uint64_t)(0x2UL) << 48) ++ ++/* User mode manages most of the SVM aperture address space. 
The low ++ * 16MB are reserved for kernel use (CWSR trap handler and kernel IB ++ * for now). ++ */ ++#define SVM_USER_BASE 0x1000000ull ++#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) ++#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) ++ ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit) ++{ ++ if (base < SVM_USER_BASE) { ++ pr_err("Set dgpu vm base 0x%llx failed.\n", base); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) ++{ ++ /* ++ * node id couldn't be 0 - the three MSB bits of ++ * aperture shoudn't be 0 ++ */ ++ pdd->lds_base = MAKE_LDS_APP_BASE_VI(); ++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); ++ ++ pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); ++ pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( ++ pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); ++ ++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); ++ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); ++} ++ ++void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) ++{ ++ pdd->lds_base = MAKE_LDS_APP_BASE_V9(); ++ pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); ++ ++ pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); ++ pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); ++} ++ + int kfd_init_apertures(struct kfd_process *process) + { + uint8_t id = 0; +@@ -300,9 +356,7 @@ int kfd_init_apertures(struct kfd_process *process) + struct kfd_process_device *pdd; + + /*Iterating over all devices*/ +- while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && +- id < NUM_OF_SUPPORTED_GPUS) { +- ++ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; +@@ -323,23 +377,35 @@ int kfd_init_apertures(struct kfd_process *process) + pdd->gpuvm_base = pdd->gpuvm_limit = 0; + pdd->scratch_base = pdd->scratch_limit = 0; + } else { +- /* +- * node id couldn't be 0 - the three MSB bits of +- * aperture shoudn't be 0 +- */ +- pdd->lds_base = MAKE_LDS_APP_BASE(id + 1); +- +- pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); +- +- pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1); +- +- pdd->gpuvm_limit = +- MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base); ++ switch (dev->device_info->asic_family) { ++ case CHIP_KAVERI: ++ case CHIP_HAWAII: ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: ++ kfd_init_apertures_vi(pdd, id); ++ break; ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ kfd_init_apertures_v9(pdd, id); ++ break; ++ default: ++ pr_err("Unknown chip in kfd_init_apertures\n"); ++ return -1; ++ } + +- pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1); +- +- pdd->scratch_limit = +- MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); ++ if (!dev->device_info->needs_iommu_device) { ++ /* dGPUs: SVM aperture starting at 0 ++ * with small reserved space for kernel ++ */ ++ pdd->gpuvm_base = SVM_USER_BASE; ++ pdd->gpuvm_limit = ++ dev->shared_resources.gpuvm_size - 1; ++ pdd->qpd.cwsr_base = SVM_CWSR_BASE; ++ pdd->qpd.ib_base = SVM_IB_BASE; ++ } + } + + dev_dbg(kfd_device, "node id %u\n", id); +@@ -356,5 +422,3 @@ int kfd_init_apertures(struct kfd_process *process) + + return 0; + } +- +- +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +new file mode 100644 +index 0000000..009d6f4 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +@@ -0,0 +1,135 @@ ++/* ++ * Copyright 2016 Advanced Micro 
Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "kfd_priv.h" ++#include "kfd_events.h" ++#include "soc15_int.h" ++ ++ ++static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid) ++{ ++ uint32_t pasid = 0; ++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; ++ ++ if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid)) ++ pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); ++ ++ return pasid; ++} ++ ++static bool event_interrupt_isr_v9(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry, ++ uint32_t *patched_ihre, ++ bool *patched_flag) ++{ ++ uint16_t source_id, client_id, pasid, vmid; ++ bool result = false; ++ ++ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); ++ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); ++ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); ++ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); ++ ++ if (pasid) { ++ const uint32_t *data = ih_ring_entry; ++ ++ pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n", ++ client_id, source_id, pasid); ++ pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", ++ data[0], data[1], data[2], data[3], ++ data[4], data[5], data[6], data[7]); ++ } ++ ++ if ((vmid >= dev->vm_info.first_vmid_kfd && ++ vmid <= dev->vm_info.last_vmid_kfd) && ++ (source_id == SOC15_INTSRC_CP_END_OF_PIPE || ++ source_id == SOC15_INTSRC_SDMA_TRAP || ++ source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || ++ source_id == SOC15_INTSRC_CP_BAD_OPCODE || ++ client_id == SOC15_IH_CLIENTID_VMC || ++ client_id == SOC15_IH_CLIENTID_UTCL2)) { ++ ++ /* ++ * KFD want to handle this INT, but MEC firmware did ++ * not send pasid. Try to get it from vmid mapping ++ * and patch the ih entry. It's a temp workaround. ++ */ ++ WARN_ONCE((!pasid), "Fix me.\n"); ++ if (!pasid) { ++ uint32_t temp = le32_to_cpu(ih_ring_entry[3]); ++ ++ pasid = kfd_get_pasid_from_vmid(dev, vmid); ++ memcpy(patched_ihre, ih_ring_entry, ++ dev->device_info->ih_ring_entry_size); ++ patched_ihre[3] = cpu_to_le32(temp | pasid); ++ *patched_flag = true; ++ } ++ result = pasid ? true : false; ++ } ++ ++ /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ ++ return result; ++ ++} ++ ++static void event_interrupt_wq_v9(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry) ++{ ++ uint16_t source_id, client_id, pasid, vmid; ++ uint32_t context_id; ++ ++ source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); ++ client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); ++ pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); ++ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); ++ context_id = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); ++ ++ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) ++ kfd_signal_event_interrupt(pasid, context_id, 32); ++ else if (source_id == SOC15_INTSRC_SDMA_TRAP) ++ kfd_signal_event_interrupt(pasid, context_id & 0xfffffff, 28); ++ else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) ++ kfd_signal_event_interrupt(pasid, context_id & 0xffffff, 24); ++ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) ++ kfd_signal_hw_exception_event(pasid); ++ else if (client_id == SOC15_IH_CLIENTID_VMC || ++ client_id == SOC15_IH_CLIENTID_UTCL2) { ++ struct kfd_vm_fault_info info = {0}; ++ uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); ++ ++ info.vmid = vmid; ++ info.mc_id = client_id; ++ info.page_addr = ih_ring_entry[4] | ++ (uint64_t)(ih_ring_entry[5] & 0xf) << 32; ++ info.prot_valid = ring_id & 0x08; ++ info.prot_read = ring_id & 0x10; ++ info.prot_write = ring_id & 0x20; ++ ++ kfd_process_vm_fault(dev->dqm, pasid); ++ kfd_signal_vm_fault_event(dev, pasid, &info); ++ } ++} ++ ++const struct kfd_event_interrupt_class event_interrupt_class_v9 = { ++ .interrupt_isr = event_interrupt_isr_v9, ++ .interrupt_wq = event_interrupt_wq_v9, ++}; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +index 035c351..cda36c8 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +@@ -149,13 +149,15 @@ static void interrupt_wq(struct work_struct *work) + ih_ring_entry); + } + +-bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry) ++bool interrupt_is_wanted(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry, ++ uint32_t *patched_ihre, bool *flag) + { + /* integer and bitwise OR so there is no boolean short-circuiting */ + unsigned int wanted = 0; + + wanted |= dev->device_info->event_interrupt_class->interrupt_isr(dev, +- ih_ring_entry); ++ ih_ring_entry, patched_ihre, flag); + + return wanted != 0; + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +new file mode 100644 +index 0000000..5b798f9 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +@@ -0,0 +1,356 @@ ++/* ++ * Copyright 2018 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include <linux/printk.h> ++#include <linux/device.h> ++#include <linux/slab.h> ++#include <linux/pci.h> ++#include <linux/amd-iommu.h> ++#include "kfd_priv.h" ++#include "kfd_dbgmgr.h" ++#include "kfd_topology.h" ++#include "kfd_iommu.h" ++ ++static const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PRI_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PASID_SUP; ++ ++/** kfd_iommu_check_device - Check whether IOMMU is available for device ++ */ ++int kfd_iommu_check_device(struct kfd_dev *kfd) ++{ ++ struct amd_iommu_device_info iommu_info; ++ int err; ++ ++ if (!kfd->device_info->needs_iommu_device) ++ return -ENODEV; ++ ++ iommu_info.flags = 0; ++ err = amd_iommu_device_info(kfd->pdev, &iommu_info); ++ if (err) ++ return err; ++ ++ if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) ++ return -ENODEV; ++ ++ return 0; ++} ++ ++/** kfd_iommu_device_init - Initialize IOMMU for device ++ */ ++int kfd_iommu_device_init(struct kfd_dev *kfd) ++{ ++ struct amd_iommu_device_info iommu_info; ++ unsigned int pasid_limit; ++ int err; ++ ++ if (!kfd->device_info->needs_iommu_device) ++ return 0; ++ ++ iommu_info.flags = 0; ++ err = amd_iommu_device_info(kfd->pdev, &iommu_info); ++ if (err < 0) { ++ dev_err(kfd_device, ++ "error getting iommu info. is the iommu enabled?\n"); ++ return -ENODEV; ++ } ++ ++ if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { ++ dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", ++ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, ++ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, ++ (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) ++ != 0); ++ return -ENODEV; ++ } ++ ++ pasid_limit = min_t(unsigned int, ++ (unsigned int)(1 << kfd->device_info->max_pasid_bits), ++ iommu_info.max_pasids); ++ ++ if (!kfd_set_pasid_limit(pasid_limit)) { ++ dev_err(kfd_device, "error setting pasid limit\n"); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ ++/** kfd_iommu_bind_process_to_device - Have the IOMMU bind a process ++ * ++ * Binds the given process to the given device using its PASID. This ++ * enables IOMMUv2 address translation for the process on the device. ++ * ++ * This function assumes that the process mutex is held. ++ */ ++int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd) ++{ ++ struct kfd_dev *dev = pdd->dev; ++ struct kfd_process *p = pdd->process; ++ int err; ++ ++ if (!dev->device_info->needs_iommu_device || pdd->bound == PDD_BOUND) ++ return 0; ++ ++ if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) { ++ pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); ++ return -EINVAL; ++ } ++ ++ err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); ++ if (!err) ++ pdd->bound = PDD_BOUND; ++ ++ return err; ++} ++ ++/** kfd_iommu_unbind_process - Unbind process from all devices ++ * ++ * This removes all IOMMU device bindings of the process. To be used ++ * before process termination. 
++ */ ++void kfd_iommu_unbind_process(struct kfd_process *p) ++{ ++ struct kfd_process_device *pdd; ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) ++ if (pdd->bound == PDD_BOUND) ++ amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); ++} ++ ++/* Callback for process shutdown invoked by the IOMMU driver */ ++static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) ++{ ++ struct kfd_dev *dev = kfd_device_by_pci_dev(pdev); ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ ++ if (!dev) ++ return; ++ ++ /* ++ * Look for the process that matches the pasid. If there is no such ++ * process, we either released it in amdkfd's own notifier, or there ++ * is a bug. Unfortunately, there is no way to tell... ++ */ ++ p = kfd_lookup_process_by_pasid(pasid); ++ if (!p) ++ return; ++ ++ pr_debug("Unbinding process %d from IOMMU\n", pasid); ++ ++ mutex_lock(kfd_get_dbgmgr_mutex()); ++ ++ if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { ++ if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { ++ kfd_dbgmgr_destroy(dev->dbgmgr); ++ dev->dbgmgr = NULL; ++ } ++ } ++ ++ mutex_unlock(kfd_get_dbgmgr_mutex()); ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_get_process_device_data(dev, p); ++ if (pdd) ++ /* For GPU relying on IOMMU, we need to dequeue here ++ * when PASID is still bound. ++ */ ++ kfd_process_dequeue_from_device(pdd); ++ ++ mutex_unlock(&p->mutex); ++ ++ kfd_unref_process(p); ++} ++ ++/* This function called by IOMMU driver on PPR failure */ ++static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, ++ unsigned long address, u16 flags) ++{ ++ struct kfd_dev *dev; ++ ++ dev_warn_ratelimited(kfd_device, ++ "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X", ++ PCI_BUS_NUM(pdev->devfn), ++ PCI_SLOT(pdev->devfn), ++ PCI_FUNC(pdev->devfn), ++ pasid, ++ address, ++ flags); ++ ++ dev = kfd_device_by_pci_dev(pdev); ++ if (!WARN_ON(!dev)) ++ kfd_signal_iommu_event(dev, pasid, address, ++ flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC); ++ ++ return AMD_IOMMU_INV_PRI_RSP_INVALID; ++} ++ ++/* ++ * Bind processes do the device that have been temporarily unbound ++ * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device. ++ */ ++static int kfd_bind_processes_to_device(struct kfd_dev *kfd) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p; ++ unsigned int temp; ++ int err = 0; ++ ++ int idx = srcu_read_lock(&kfd_processes_srcu); ++ ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ mutex_lock(&p->mutex); ++ pdd = kfd_get_process_device_data(kfd, p); ++ ++ if (WARN_ON(!pdd) || pdd->bound != PDD_BOUND_SUSPENDED) { ++ mutex_unlock(&p->mutex); ++ continue; ++ } ++ ++ err = amd_iommu_bind_pasid(kfd->pdev, p->pasid, ++ p->lead_thread); ++ if (err < 0) { ++ pr_err("Unexpected pasid %d binding failure\n", ++ p->pasid); ++ mutex_unlock(&p->mutex); ++ break; ++ } ++ ++ pdd->bound = PDD_BOUND; ++ mutex_unlock(&p->mutex); ++ } ++ ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++ ++ return err; ++} ++ ++/* ++ * Mark currently bound processes as PDD_BOUND_SUSPENDED. These ++ * processes will be restored to PDD_BOUND state in ++ * kfd_bind_processes_to_device. 
++ */ ++static void kfd_unbind_processes_from_device(struct kfd_dev *kfd) ++{ ++ struct kfd_process_device *pdd; ++ struct kfd_process *p; ++ unsigned int temp; ++ ++ int idx = srcu_read_lock(&kfd_processes_srcu); ++ ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ mutex_lock(&p->mutex); ++ pdd = kfd_get_process_device_data(kfd, p); ++ ++ if (WARN_ON(!pdd)) { ++ mutex_unlock(&p->mutex); ++ continue; ++ } ++ ++ if (pdd->bound == PDD_BOUND) ++ pdd->bound = PDD_BOUND_SUSPENDED; ++ mutex_unlock(&p->mutex); ++ } ++ ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++} ++ ++/** kfd_iommu_suspend - Prepare IOMMU for suspend ++ * ++ * This unbinds processes from the device and disables the IOMMU for ++ * the device. ++ */ ++void kfd_iommu_suspend(struct kfd_dev *kfd) ++{ ++ if (!kfd->device_info->needs_iommu_device) ++ return; ++ ++ kfd_unbind_processes_from_device(kfd); ++ ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); ++} ++ ++/** kfd_iommu_resume - Restore IOMMU after resume ++ * ++ * This reinitializes the IOMMU for the device and re-binds previously ++ * suspended processes to the device. ++ */ ++int kfd_iommu_resume(struct kfd_dev *kfd) ++{ ++ unsigned int pasid_limit; ++ int err; ++ ++ if (!kfd->device_info->needs_iommu_device) ++ return 0; ++ ++ pasid_limit = kfd_get_pasid_limit(); ++ ++ err = amd_iommu_init_device(kfd->pdev, pasid_limit); ++ if (err) ++ return -ENXIO; ++ ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, ++ iommu_pasid_shutdown_callback); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, ++ iommu_invalid_ppr_cb); ++ ++ err = kfd_bind_processes_to_device(kfd); ++ if (err) { ++ amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL); ++ amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL); ++ amd_iommu_free_device(kfd->pdev); ++ return err; ++ } ++ ++ return 0; ++} ++ ++extern bool amd_iommu_pc_supported(void); ++extern u8 amd_iommu_pc_get_max_banks(u16 devid); ++extern u8 amd_iommu_pc_get_max_counters(u16 devid); ++ ++/** kfd_iommu_add_perf_counters - Add IOMMU performance counters to topology ++ */ ++int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) ++{ ++ struct kfd_perf_properties *props; ++ ++ if (!(kdev->node_props.capability & HSA_CAP_ATS_PRESENT)) ++ return 0; ++ ++ if (!amd_iommu_pc_supported()) ++ return 0; ++ ++ props = kfd_alloc_struct(props); ++ if (!props) ++ return -ENOMEM; ++ strcpy(props->block_name, "iommu"); ++ props->max_concurrent = amd_iommu_pc_get_max_banks(0) * ++ amd_iommu_pc_get_max_counters(0); /* assume one iommu */ ++ list_add_tail(&props->list, &kdev->perf_props); ++ ++ return 0; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h +new file mode 100644 +index 0000000..dd23d9f +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.h +@@ -0,0 +1,78 @@ ++/* ++ * Copyright 2018 Advanced Micro Devices, Inc. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef __KFD_IOMMU_H__ ++#define __KFD_IOMMU_H__ ++ ++#if defined(CONFIG_AMD_IOMMU_V2_MODULE) || defined(CONFIG_AMD_IOMMU_V2) ++ ++#define KFD_SUPPORT_IOMMU_V2 ++ ++int kfd_iommu_check_device(struct kfd_dev *kfd); ++int kfd_iommu_device_init(struct kfd_dev *kfd); ++ ++int kfd_iommu_bind_process_to_device(struct kfd_process_device *pdd); ++void kfd_iommu_unbind_process(struct kfd_process *p); ++ ++void kfd_iommu_suspend(struct kfd_dev *kfd); ++int kfd_iommu_resume(struct kfd_dev *kfd); ++ ++int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev); ++ ++#else ++ ++static inline int kfd_iommu_check_device(struct kfd_dev *kfd) ++{ ++ return -ENODEV; ++} ++static inline int kfd_iommu_device_init(struct kfd_dev *kfd) ++{ ++ return 0; ++} ++ ++static inline int kfd_iommu_bind_process_to_device( ++ struct kfd_process_device *pdd) ++{ ++ return 0; ++} ++static inline void kfd_iommu_unbind_process(struct kfd_process *p) ++{ ++ /* empty */ ++} ++ ++static inline void kfd_iommu_suspend(struct kfd_dev *kfd) ++{ ++ /* empty */ ++} ++static inline int kfd_iommu_resume(struct kfd_dev *kfd) ++{ ++ return 0; ++} ++ ++static inline int kfd_iommu_add_perf_counters(struct kfd_topology_device *kdev) ++{ ++ return 0; ++} ++ ++#endif /* defined(CONFIG_AMD_IOMMU_V2) */ ++ ++#endif /* __KFD_IOMMU_H__ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +new file mode 100644 +index 0000000..97806ed +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +@@ -0,0 +1,270 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include <linux/dma-buf.h> ++#include <linux/slab.h> ++#include <linux/random.h> ++ ++#include "kfd_ipc.h" ++#include "kfd_priv.h" ++ ++#define KFD_IPC_HASH_TABLE_SIZE_SHIFT 4 ++#define KFD_IPC_HASH_TABLE_SIZE_MASK ((1 << KFD_IPC_HASH_TABLE_SIZE_SHIFT) - 1) ++ ++static struct kfd_ipc_handles { ++ DECLARE_HASHTABLE(handles, KFD_IPC_HASH_TABLE_SIZE_SHIFT); ++ struct mutex lock; ++} kfd_ipc_handles; ++ ++/* Since, handles are random numbers, it can be used directly as hashing key. ++ * The least 4 bits of the handle are used as key. However, during import all ++ * 128 bits of the handle are checked to prevent handle snooping. ++ */ ++#define HANDLE_TO_KEY(sh) ((*(uint64_t *)sh) & KFD_IPC_HASH_TABLE_SIZE_MASK) ++ ++static int ipc_store_insert(void *val, void *sh, struct kfd_ipc_obj **ipc_obj) ++{ ++ struct kfd_ipc_obj *obj; ++ ++ obj = kmalloc(sizeof(*obj), GFP_KERNEL); ++ if (!obj) ++ return -ENOMEM; ++ ++ /* The initial ref belongs to the allocator process. ++ * The IPC object store itself does not hold a ref since ++ * there is no specific moment in time where that ref should ++ * be dropped, except "when there are no more userspace processes ++ * holding a ref to the object". Therefore the removal from IPC ++ * storage happens at ipc_obj release time. ++ */ ++ kref_init(&obj->ref); ++ obj->data = val; ++ get_random_bytes(obj->share_handle, sizeof(obj->share_handle)); ++ ++ memcpy(sh, obj->share_handle, sizeof(obj->share_handle)); ++ ++ mutex_lock(&kfd_ipc_handles.lock); ++ hlist_add_head(&obj->node, ++ &kfd_ipc_handles.handles[HANDLE_TO_KEY(obj->share_handle)]); ++ mutex_unlock(&kfd_ipc_handles.lock); ++ ++ if (ipc_obj) ++ *ipc_obj = obj; ++ ++ return 0; ++} ++ ++static void ipc_obj_release(struct kref *r) ++{ ++ struct kfd_ipc_obj *obj; ++ ++ obj = container_of(r, struct kfd_ipc_obj, ref); ++ ++ mutex_lock(&kfd_ipc_handles.lock); ++ hash_del(&obj->node); ++ mutex_unlock(&kfd_ipc_handles.lock); ++ ++ dma_buf_put(obj->data); ++ kfree(obj); ++} ++ ++void ipc_obj_get(struct kfd_ipc_obj *obj) ++{ ++ kref_get(&obj->ref); ++} ++ ++void ipc_obj_put(struct kfd_ipc_obj **obj) ++{ ++ kref_put(&(*obj)->ref, ipc_obj_release); ++ *obj = NULL; ++} ++ ++int kfd_ipc_init(void) ++{ ++ mutex_init(&kfd_ipc_handles.lock); ++ hash_init(kfd_ipc_handles.handles); ++ return 0; ++} ++ ++static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, ++ struct kfd_process *p, ++ uint32_t gpu_id, struct dma_buf *dmabuf, ++ uint64_t va_addr, uint64_t *handle, ++ uint64_t *mmap_offset, ++ struct kfd_ipc_obj *ipc_obj) ++{ ++ int r; ++ void *mem; ++ uint64_t size; ++ int idr_handle; ++ struct kfd_process_device *pdd = NULL; ++ ++ if (!handle) ++ return -EINVAL; ++ ++ if (!dev || !dev->kfd2kgd->import_dmabuf) ++ return -EINVAL; ++ ++ mutex_lock(&p->mutex); ++ ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ r = PTR_ERR(pdd); ++ goto err_unlock; ++ } ++ ++ r = dev->kfd2kgd->import_dmabuf(dev->kgd, dmabuf, ++ va_addr, pdd->vm, ++ (struct kgd_mem **)&mem, &size, ++ mmap_offset); ++ if (r) ++ goto err_unlock; ++ ++ idr_handle = kfd_process_device_create_obj_handle(pdd, mem, ++ va_addr, size, ++ ipc_obj); ++ if (idr_handle < 0) { ++ r = -EFAULT; ++ goto err_free; ++ } ++ ++ mutex_unlock(&p->mutex); ++ ++ *handle = 
MAKE_HANDLE(gpu_id, idr_handle); ++ ++ return 0; ++ ++err_free: ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, ++ (struct kgd_mem *)mem); ++err_unlock: ++ mutex_unlock(&p->mutex); ++ return r; ++} ++ ++int kfd_ipc_import_dmabuf(struct kfd_dev *dev, ++ struct kfd_process *p, ++ uint32_t gpu_id, int dmabuf_fd, ++ uint64_t va_addr, uint64_t *handle, ++ uint64_t *mmap_offset) ++{ ++ int r; ++ struct dma_buf *dmabuf = dma_buf_get(dmabuf_fd); ++ ++ if (!dmabuf) ++ return -EINVAL; ++ ++ r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, dmabuf, ++ va_addr, handle, mmap_offset, ++ NULL); ++ dma_buf_put(dmabuf); ++ return r; ++} ++ ++int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, ++ uint32_t gpu_id, uint32_t *share_handle, ++ uint64_t va_addr, uint64_t *handle, ++ uint64_t *mmap_offset) ++{ ++ int r; ++ struct kfd_ipc_obj *entry, *found = NULL; ++ ++ mutex_lock(&kfd_ipc_handles.lock); ++ /* Convert the user provided handle to hash key and search only in that ++ * bucket ++ */ ++ hlist_for_each_entry(entry, ++ &kfd_ipc_handles.handles[HANDLE_TO_KEY(share_handle)], node) { ++ if (!memcmp(entry->share_handle, share_handle, ++ sizeof(entry->share_handle))) { ++ found = entry; ++ break; ++ } ++ } ++ mutex_unlock(&kfd_ipc_handles.lock); ++ ++ if (!found) ++ return -EINVAL; ++ ipc_obj_get(found); ++ ++ pr_debug("Found ipc_dma_buf: %p\n", found->data); ++ ++ r = kfd_import_dmabuf_create_kfd_bo(dev, p, gpu_id, found->data, ++ va_addr, handle, mmap_offset, ++ found); ++ if (r) ++ goto error_unref; ++ ++ return r; ++ ++error_unref: ++ ipc_obj_put(&found); ++ return r; ++} ++ ++int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, ++ uint64_t handle, uint32_t *ipc_handle) ++{ ++ struct kfd_process_device *pdd = NULL; ++ struct kfd_ipc_obj *obj; ++ struct kfd_bo *kfd_bo = NULL; ++ struct dma_buf *dmabuf; ++ int r; ++ ++ if (!dev || !ipc_handle) ++ return -EINVAL; ++ ++ mutex_lock(&p->mutex); ++ pdd = kfd_bind_process_to_device(dev, p); ++ if (IS_ERR(pdd)) { ++ mutex_unlock(&p->mutex); ++ pr_err("Failed to get pdd\n"); ++ return PTR_ERR(pdd); ++ } ++ ++ kfd_bo = kfd_process_device_find_bo(pdd, GET_IDR_HANDLE(handle)); ++ mutex_unlock(&p->mutex); ++ ++ if (!kfd_bo) { ++ pr_err("Failed to get bo"); ++ return -EINVAL; ++ } ++ if (kfd_bo->kfd_ipc_obj) { ++ memcpy(ipc_handle, kfd_bo->kfd_ipc_obj->share_handle, ++ sizeof(kfd_bo->kfd_ipc_obj->share_handle)); ++ return 0; ++ } ++ ++ r = dev->kfd2kgd->export_dmabuf(dev->kgd, pdd->vm, ++ (struct kgd_mem *)kfd_bo->mem, ++ &dmabuf); ++ if (r) ++ return r; ++ ++ r = ipc_store_insert(dmabuf, ipc_handle, &obj); ++ if (r) ++ return r; ++ ++ kfd_bo->kfd_ipc_obj = obj; ++ ++ return r; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h +new file mode 100644 +index 0000000..9ee8627 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.h +@@ -0,0 +1,51 @@ ++/* ++ * Copyright 2014 Advanced Micro Devices, Inc. 
++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#ifndef KFD_IPC_H_ ++#define KFD_IPC_H_ ++ ++#include <linux/types.h> ++#include "kfd_priv.h" ++ ++struct kfd_ipc_obj { ++ struct hlist_node node; ++ struct kref ref; ++ void *data; ++ uint32_t share_handle[4]; ++}; ++ ++int kfd_ipc_import_handle(struct kfd_dev *dev, struct kfd_process *p, ++ uint32_t gpu_id, uint32_t *share_handle, ++ uint64_t va_addr, uint64_t *handle, ++ uint64_t *mmap_offset); ++int kfd_ipc_import_dmabuf(struct kfd_dev *kfd, struct kfd_process *p, ++ uint32_t gpu_id, int dmabuf_fd, ++ uint64_t va_addr, uint64_t *handle, ++ uint64_t *mmap_offset); ++int kfd_ipc_export_as_handle(struct kfd_dev *dev, struct kfd_process *p, ++ uint64_t handle, uint32_t *ipc_handle); ++ ++void ipc_obj_get(struct kfd_ipc_obj *obj); ++void ipc_obj_put(struct kfd_ipc_obj **obj); ++ ++#endif /* KFD_IPC_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 5dc6567..8cf9d44 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -99,7 +99,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + kq->rptr_kernel = kq->rptr_mem->cpu_ptr; + kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr; + +- retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel), ++ retval = kfd_gtt_sa_allocate(dev, dev->device_info->doorbell_size, + &kq->wptr_mem); + + if (retval != 0) +@@ -123,6 +123,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + prop.write_ptr = (uint32_t *) kq->wptr_gpu_addr; + prop.eop_ring_buffer_address = kq->eop_gpu_addr; + prop.eop_ring_buffer_size = PAGE_SIZE; ++ prop.cu_mask = NULL; + + if (init_queue(&kq->queue, &prop) != 0) + goto err_init_queue; +@@ -208,6 +209,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + size_t available_size; + size_t queue_size_dwords; + uint32_t wptr, rptr; ++ uint64_t wptr64; + unsigned int *queue_address; + + /* When rptr == wptr, the buffer is empty. +@@ -216,7 +218,8 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + * the opposite. So we can only use up to queue_size_dwords - 1 dwords. 
+ */ + rptr = *kq->rptr_kernel; +- wptr = *kq->wptr_kernel; ++ wptr = kq->pending_wptr; ++ wptr64 = kq->pending_wptr64; + queue_address = (unsigned int *)kq->pq_kernel_addr; + queue_size_dwords = kq->queue->properties.queue_size / 4; + +@@ -232,27 +235,64 @@ static int acquire_packet_buffer(struct kernel_queue *kq, + * make sure calling functions know + * acquire_packet_buffer() failed + */ +- *buffer_ptr = NULL; +- return -ENOMEM; ++ goto err_no_space; + } + + if (wptr + packet_size_in_dwords >= queue_size_dwords) { + /* make sure after rolling back to position 0, there is + * still enough space. + */ +- if (packet_size_in_dwords >= rptr) { +- *buffer_ptr = NULL; +- return -ENOMEM; +- } ++ if (packet_size_in_dwords >= rptr) ++ goto err_no_space; ++ + /* fill nops, roll back and start at position 0 */ + while (wptr > 0) { + queue_address[wptr] = kq->nop_packet; + wptr = (wptr + 1) % queue_size_dwords; ++ wptr64++; + } + } + + *buffer_ptr = &queue_address[wptr]; + kq->pending_wptr = wptr + packet_size_in_dwords; ++ kq->pending_wptr64 = wptr64 + packet_size_in_dwords; ++ ++ return 0; ++ ++err_no_space: ++ *buffer_ptr = NULL; ++ return -ENOMEM; ++} ++ ++static int acquire_inline_ib(struct kernel_queue *kq, ++ size_t size_in_dwords, ++ unsigned int **buffer_ptr, ++ uint64_t *gpu_addr) ++{ ++ int ret; ++ unsigned int *buf; ++ union PM4_MES_TYPE_3_HEADER nop; ++ ++ if (size_in_dwords >= (1 << 14)) ++ return -EINVAL; ++ ++ /* Allocate size_in_dwords on the ring, plus an extra dword ++ * for a NOP packet header ++ */ ++ ret = acquire_packet_buffer(kq, size_in_dwords + 1, &buf); ++ if (ret) ++ return ret; ++ ++ /* Build a NOP packet that contains the IB as "payload". */ ++ nop.u32all = 0; ++ nop.opcode = IT_NOP; ++ nop.count = size_in_dwords - 1; ++ nop.type = PM4_TYPE_3; ++ ++ *buf = nop.u32all; ++ *buffer_ptr = buf + 1; ++ *gpu_addr = kq->pq_gpu_addr + ((unsigned long)*buffer_ptr - ++ (unsigned long)kq->pq_kernel_addr); + + return 0; + } +@@ -270,9 +310,7 @@ static void submit_packet(struct kernel_queue *kq) + pr_debug("\n"); + #endif + +- *kq->wptr_kernel = kq->pending_wptr; +- write_kernel_doorbell(kq->queue->properties.doorbell_ptr, +- kq->pending_wptr); ++ kq->ops_asic_specific.submit_packet(kq); + } + + static void rollback_packet(struct kernel_queue *kq) +@@ -292,17 +330,28 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + kq->ops.initialize = initialize; + kq->ops.uninitialize = uninitialize; + kq->ops.acquire_packet_buffer = acquire_packet_buffer; ++ kq->ops.acquire_inline_ib = acquire_inline_ib; + kq->ops.submit_packet = submit_packet; + kq->ops.rollback_packet = rollback_packet; + + switch (dev->device_info->asic_family) { + case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: + kernel_queue_init_vi(&kq->ops_asic_specific); + break; + + case CHIP_KAVERI: ++ case CHIP_HAWAII: + kernel_queue_init_cik(&kq->ops_asic_specific); + break; ++ ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ kernel_queue_init_v9(&kq->ops_asic_specific); ++ break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +index 5940531..82c94a6 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +@@ -42,6 +42,12 @@ + * pending write pointer to that location so subsequent calls to + * acquire_packet_buffer will get a correct write pointer + * ++ * 
@acquire_inline_ib: Returns a pointer to the location in the kernel ++ * queue ring buffer where the calling function can write an inline IB. It is ++ * Guaranteed that there is enough space for that IB. It also updates the ++ * pending write pointer to that location so subsequent calls to ++ * acquire_packet_buffer will get a correct write pointer ++ * + * @submit_packet: Update the write pointer and doorbell of a kernel queue. + * + * @sync_with_hw: Wait until the write pointer and the read pointer of a kernel +@@ -59,6 +65,10 @@ struct kernel_queue_ops { + int (*acquire_packet_buffer)(struct kernel_queue *kq, + size_t packet_size_in_dwords, + unsigned int **buffer_ptr); ++ int (*acquire_inline_ib)(struct kernel_queue *kq, ++ size_t packet_size_in_dwords, ++ unsigned int **buffer_ptr, ++ uint64_t *gpu_addr); + + void (*submit_packet)(struct kernel_queue *kq); + void (*rollback_packet)(struct kernel_queue *kq); +@@ -72,6 +82,7 @@ struct kernel_queue { + struct kfd_dev *dev; + struct mqd_manager *mqd; + struct queue *queue; ++ uint64_t pending_wptr64; + uint32_t pending_wptr; + unsigned int nop_packet; + +@@ -79,7 +90,10 @@ struct kernel_queue { + uint32_t *rptr_kernel; + uint64_t rptr_gpu_addr; + struct kfd_mem_obj *wptr_mem; +- uint32_t *wptr_kernel; ++ union { ++ uint64_t *wptr64_kernel; ++ uint32_t *wptr_kernel; ++ }; + uint64_t wptr_gpu_addr; + struct kfd_mem_obj *pq; + uint64_t pq_gpu_addr; +@@ -97,5 +111,6 @@ struct kernel_queue { + + void kernel_queue_init_cik(struct kernel_queue_ops *ops); + void kernel_queue_init_vi(struct kernel_queue_ops *ops); ++void kernel_queue_init_v9(struct kernel_queue_ops *ops); + + #endif /* KFD_KERNEL_QUEUE_H_ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +index a90eb44..2808422 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +@@ -22,15 +22,19 @@ + */ + + #include "kfd_kernel_queue.h" ++#include "kfd_pm4_headers.h" ++#include "kfd_pm4_opcodes.h" + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_cik(struct kernel_queue *kq); ++static void submit_packet_cik(struct kernel_queue *kq); + + void kernel_queue_init_cik(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_cik; + ops->uninitialize = uninitialize_cik; ++ ops->submit_packet = submit_packet_cik; + } + + static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -42,3 +46,127 @@ static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, + static void uninitialize_cik(struct kernel_queue *kq) + { + } ++ ++static void submit_packet_cik(struct kernel_queue *kq) ++{ ++ *kq->wptr_kernel = kq->pending_wptr; ++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr); ++} ++ ++static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process *packet; ++ ++ packet = (struct pm4_map_process *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process)); ++ ++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 
1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static int pm_map_process_scratch_cik(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_map_process_scratch_kv *packet; ++ ++ packet = (struct pm4_map_process_scratch_kv *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); ++ ++ packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_map_process_scratch_kv)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++static uint32_t pm_get_map_process_packet_size_cik(void) ++{ ++ return sizeof(struct pm4_map_process); ++} ++static uint32_t pm_get_map_process_scratch_packet_size_cik(void) ++{ ++ return sizeof(struct pm4_map_process_scratch_kv); ++} ++ ++ ++static struct packet_manager_funcs kfd_cik_pm_funcs = { ++ .map_process = pm_map_process_cik, ++ .runlist = pm_runlist_vi, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_vi, ++ .unmap_queues = pm_unmap_queues_vi, ++ .query_status = pm_query_status_vi, ++ .release_mem = pm_release_mem_vi, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_cik, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, ++}; ++ ++static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { ++ .map_process = pm_map_process_scratch_cik, ++ .runlist = pm_runlist_vi, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_vi, ++ .unmap_queues = pm_unmap_queues_vi, ++ .query_status = pm_query_status_vi, ++ .release_mem = pm_release_mem_vi, ++ .get_map_process_packet_size = ++ pm_get_map_process_scratch_packet_size_cik, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, ++ 
.get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, ++}; ++ ++void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ if (fw_ver >= KFD_SCRATCH_KV_FW_VER) ++ pm->pmf = &kfd_cik_scratch_pm_funcs; ++ else ++ pm->pmf = &kfd_cik_pm_funcs; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +new file mode 100644 +index 0000000..5fe4f60 +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +@@ -0,0 +1,377 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. 
++ * ++ */ ++ ++#include "kfd_kernel_queue.h" ++#include "kfd_device_queue_manager.h" ++#include "kfd_pm4_headers_ai.h" ++#include "kfd_pm4_opcodes.h" ++ ++static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, ++ enum kfd_queue_type type, unsigned int queue_size); ++static void uninitialize_v9(struct kernel_queue *kq); ++static void submit_packet_v9(struct kernel_queue *kq); ++ ++void kernel_queue_init_v9(struct kernel_queue_ops *ops) ++{ ++ ops->initialize = initialize_v9; ++ ops->uninitialize = uninitialize_v9; ++ ops->submit_packet = submit_packet_v9; ++} ++ ++static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, ++ enum kfd_queue_type type, unsigned int queue_size) ++{ ++ int retval; ++ ++ retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); ++ if (retval != 0) ++ return false; ++ ++ kq->eop_gpu_addr = kq->eop_mem->gpu_addr; ++ kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; ++ ++ memset(kq->eop_kernel_addr, 0, PAGE_SIZE); ++ ++ return true; ++} ++ ++static void uninitialize_v9(struct kernel_queue *kq) ++{ ++ kfd_gtt_sa_free(kq->dev, kq->eop_mem); ++} ++ ++static void submit_packet_v9(struct kernel_queue *kq) ++{ ++ *kq->wptr64_kernel = kq->pending_wptr64; ++ write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr64); ++} ++ ++static int pm_map_process_v9(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_mes_map_process *packet; ++ uint64_t vm_page_table_base_addr = ++ (uint64_t)(qpd->page_table_base) << 12; ++ ++ packet = (struct pm4_mes_map_process *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_mes_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields14.gds_size = qpd->gds_size; ++ packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_oac = qpd->num_oac; ++ packet->bitfields14.sdma_enable = 1; ++ packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); ++ packet->sq_shader_tba_hi = upper_32_bits(qpd->tba_addr >> 8); ++ packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); ++ packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ packet->vm_context_page_table_base_addr_lo32 = ++ lower_32_bits(vm_page_table_base_addr); ++ packet->vm_context_page_table_base_addr_hi32 = ++ upper_32_bits(vm_page_table_base_addr); ++ ++ return 0; ++} ++ ++static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain) ++{ ++ struct pm4_mes_runlist *packet; ++ ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; ++ ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). 
++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ ++ packet = (struct pm4_mes_runlist *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); ++ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, ++ sizeof(struct pm4_mes_runlist)); ++ ++ packet->bitfields4.ib_size = ib_size_in_dwords; ++ packet->bitfields4.chain = chain ? 1 : 0; ++ packet->bitfields4.offload_polling = 0; ++ packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; ++ packet->ordinal2 = lower_32_bits(ib); ++ packet->ib_base_hi = upper_32_bits(ib); ++ ++ return 0; ++} ++ ++static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_mes_map_queues *packet; ++ bool use_static = is_static; ++ ++ packet = (struct pm4_mes_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_mes_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe_vi; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; ++ ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute_vi; ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_compute_vi; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ if (use_static) ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_latency_static_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__debug_interface_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + ++ engine_sel__mes_map_queues__sdma0_vi; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ WARN(1, "queue type %d", q->properties.type); ++ return -EINVAL; ++ } ++ packet->bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ packet->mqd_addr_hi = ++ upper_32_bits(q->gart_mqd_addr); ++ ++ packet->wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} ++ ++static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine) ++{ ++ struct pm4_mes_unmap_queues *packet; ++ ++ packet = (struct pm4_mes_unmap_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, ++ sizeof(struct pm4_mes_unmap_queues)); ++ switch (type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; ++ break; ++ default: ++ WARN(1, "queue type %d", type); ++ return -EINVAL; ++ } ++ ++ if (reset) ++ packet->bitfields2.action = ++ action__mes_unmap_queues__reset_queues; ++ else ++ packet->bitfields2.action = ++ action__mes_unmap_queues__preempt_queues; ++ ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: ++ packet->bitfields2.queue_sel = ++ 
queue_sel__mes_unmap_queues__perform_request_on_specified_queues; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields3b.doorbell_offset0 = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; ++ packet->bitfields3a.pasid = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_queues; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: ++ /* in this case, we do not preempt static queues */ ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; ++ break; ++ default: ++ WARN(1, "filter %d", filter); ++ return -EINVAL; ++ } ++ ++ return 0; ++ ++} ++ ++static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value) ++{ ++ struct pm4_mes_query_status *packet; ++ ++ packet = (struct pm4_mes_query_status *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); ++ ++ ++ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, ++ sizeof(struct pm4_mes_query_status)); ++ ++ packet->bitfields2.context_id = 0; ++ packet->bitfields2.interrupt_sel = ++ interrupt_sel__mes_query_status__completion_status; ++ packet->bitfields2.command = ++ command__mes_query_status__fence_only_after_write_ack; ++ ++ packet->addr_hi = upper_32_bits((uint64_t)fence_address); ++ packet->addr_lo = lower_32_bits((uint64_t)fence_address); ++ packet->data_hi = upper_32_bits((uint64_t)fence_value); ++ packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ ++ return 0; ++} ++ ++ ++static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) ++{ ++ struct pm4_mec_release_mem *packet; ++ ++ packet = (struct pm4_mec_release_mem *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, ++ sizeof(struct pm4_mec_release_mem)); ++ ++ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; ++ packet->bitfields2.tcl1_action_ena = 1; ++ packet->bitfields2.tc_action_ena = 1; ++ packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; ++ ++ packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; ++ packet->bitfields3.int_sel = ++ int_sel__mec_release_mem__send_interrupt_after_write_confirm; ++ ++ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; ++ packet->address_hi = upper_32_bits(gpu_addr); ++ ++ packet->data_lo = 0; ++ ++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); ++} ++ ++static uint32_t pm_get_map_process_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_map_process); ++} ++ ++static uint32_t pm_get_runlist_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_runlist); ++} ++ ++static uint32_t pm_get_map_queues_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_map_queues); ++} ++ ++static uint32_t pm_get_unmap_queues_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_unmap_queues); ++} ++ ++static uint32_t pm_get_query_status_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mes_query_status); ++} ++ ++static uint32_t pm_get_release_mem_packet_size_v9(void) ++{ ++ return sizeof(struct pm4_mec_release_mem); ++} ++ ++static struct packet_manager_funcs kfd_v9_pm_funcs = { ++ .map_process = pm_map_process_v9, ++ .runlist = pm_runlist_v9, ++ 
.set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_v9, ++ .unmap_queues = pm_unmap_queues_v9, ++ .query_status = pm_query_status_v9, ++ .release_mem = pm_release_mem_v9, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_v9, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_v9, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_v9, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, ++}; ++ ++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ pm->pmf = &kfd_v9_pm_funcs; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +index f1d4828..9022ecb 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +@@ -22,15 +22,20 @@ + */ + + #include "kfd_kernel_queue.h" ++#include "kfd_device_queue_manager.h" ++#include "kfd_pm4_headers_vi.h" ++#include "kfd_pm4_opcodes.h" + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); + static void uninitialize_vi(struct kernel_queue *kq); ++static void submit_packet_vi(struct kernel_queue *kq); + + void kernel_queue_init_vi(struct kernel_queue_ops *ops) + { + ops->initialize = initialize_vi; + ops->uninitialize = uninitialize_vi; ++ ops->submit_packet = submit_packet_vi; + } + + static bool initialize_vi(struct kernel_queue *kq, struct kfd_dev *dev, +@@ -54,3 +59,359 @@ static void uninitialize_vi(struct kernel_queue *kq) + { + kfd_gtt_sa_free(kq->dev, kq->eop_mem); + } ++ ++static void submit_packet_vi(struct kernel_queue *kq) ++{ ++ *kq->wptr_kernel = kq->pending_wptr; ++ write_kernel_doorbell(kq->queue->properties.doorbell_ptr, ++ kq->pending_wptr); ++} ++ ++static int pm_map_process_vi(struct packet_manager *pm, ++ uint32_t *buffer, struct qcm_process_device *qpd) ++{ ++ struct pm4_mes_map_process *packet; ++ ++ packet = (struct pm4_mes_map_process *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_process)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, ++ sizeof(struct pm4_mes_map_process)); ++ packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; ++ packet->bitfields2.process_quantum = 1; ++ packet->bitfields2.pasid = qpd->pqm->process->pasid; ++ packet->bitfields3.page_table_base = qpd->page_table_base; ++ packet->bitfields10.gds_size = qpd->gds_size; ++ packet->bitfields10.num_gws = qpd->num_gws; ++ packet->bitfields10.num_oac = qpd->num_oac; ++ packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; ++ ++ packet->sh_mem_config = qpd->sh_mem_config; ++ packet->sh_mem_bases = qpd->sh_mem_bases; ++ packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; ++ packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; ++ ++ packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; ++ ++ packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); ++ packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); ++ ++ return 0; ++} ++ ++ ++unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) ++{ ++ union PM4_MES_TYPE_3_HEADER header; ++ ++ header.u32All = 0; ++ header.opcode = opcode; ++ header.count = packet_size / 4 - 2; ++ header.type = PM4_TYPE_3; ++ ++ return header.u32All; ++} ++ ++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain) ++{ ++ struct pm4_mes_runlist *packet; ++ ++ int concurrent_proc_cnt = 0; ++ struct kfd_dev *kfd = pm->dqm->dev; ++ ++ /* Determine the number of processes to map together to HW: ++ * it can not exceed the number of VMIDs available to the ++ * scheduler, and it is determined by the smaller of the number ++ * of processes in the runlist and kfd module parameter ++ * hws_max_conc_proc. ++ * Note: the arbitration between the number of VMIDs and ++ * hws_max_conc_proc has been done in ++ * kgd2kfd_device_init(). ++ */ ++ concurrent_proc_cnt = min(pm->dqm->processes_count, ++ kfd->max_proc_per_quantum); ++ ++ ++ packet = (struct pm4_mes_runlist *)buffer; ++ ++ memset(buffer, 0, sizeof(struct pm4_mes_runlist)); ++ packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, ++ sizeof(struct pm4_mes_runlist)); ++ ++ packet->bitfields4.ib_size = ib_size_in_dwords; ++ packet->bitfields4.chain = chain ? 1 : 0; ++ packet->bitfields4.offload_polling = 0; ++ packet->bitfields4.valid = 1; ++ packet->bitfields4.process_cnt = concurrent_proc_cnt; ++ packet->ordinal2 = lower_32_bits(ib); ++ packet->bitfields3.ib_base_hi = upper_32_bits(ib); ++ ++ return 0; ++} ++ ++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static) ++{ ++ struct pm4_mes_map_queues *packet; ++ bool use_static = is_static; ++ ++ packet = (struct pm4_mes_map_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, ++ sizeof(struct pm4_mes_map_queues)); ++ packet->bitfields2.alloc_format = ++ alloc_format__mes_map_queues__one_per_pipe_vi; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; ++ ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_map_queues__compute_vi; ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_compute_vi; ++ ++ switch (q->properties.type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ if (use_static) ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__normal_latency_static_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.queue_type = ++ queue_type__mes_map_queues__debug_interface_queue_vi; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = q->properties.sdma_engine_id + ++ engine_sel__mes_map_queues__sdma0_vi; ++ use_static = false; /* no static queues under SDMA */ ++ break; ++ default: ++ WARN(1, "queue type %d", q->properties.type); ++ return -EINVAL; ++ } ++ packet->bitfields3.doorbell_offset = ++ q->properties.doorbell_off; ++ ++ packet->mqd_addr_lo = ++ lower_32_bits(q->gart_mqd_addr); ++ ++ 
packet->mqd_addr_hi = ++ upper_32_bits(q->gart_mqd_addr); ++ ++ packet->wptr_addr_lo = ++ lower_32_bits((uint64_t)q->properties.write_ptr); ++ ++ packet->wptr_addr_hi = ++ upper_32_bits((uint64_t)q->properties.write_ptr); ++ ++ return 0; ++} ++ ++int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res) ++{ ++ struct pm4_mes_set_resources *packet; ++ ++ packet = (struct pm4_mes_set_resources *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, ++ sizeof(struct pm4_mes_set_resources)); ++ ++ packet->bitfields2.queue_type = ++ queue_type__mes_set_resources__hsa_interface_queue_hiq; ++ packet->bitfields2.vmid_mask = res->vmid_mask; ++ packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; ++ packet->bitfields7.oac_mask = res->oac_mask; ++ packet->bitfields8.gds_heap_base = res->gds_heap_base; ++ packet->bitfields8.gds_heap_size = res->gds_heap_size; ++ ++ packet->gws_mask_lo = lower_32_bits(res->gws_mask); ++ packet->gws_mask_hi = upper_32_bits(res->gws_mask); ++ ++ packet->queue_mask_lo = lower_32_bits(res->queue_mask); ++ packet->queue_mask_hi = upper_32_bits(res->queue_mask); ++ ++ return 0; ++} ++ ++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine) ++{ ++ struct pm4_mes_unmap_queues *packet; ++ ++ packet = (struct pm4_mes_unmap_queues *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, ++ sizeof(struct pm4_mes_unmap_queues)); ++ switch (type) { ++ case KFD_QUEUE_TYPE_COMPUTE: ++ case KFD_QUEUE_TYPE_DIQ: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__compute; ++ break; ++ case KFD_QUEUE_TYPE_SDMA: ++ packet->bitfields2.engine_sel = ++ engine_sel__mes_unmap_queues__sdma0 + sdma_engine; ++ break; ++ default: ++ WARN(1, "queue type %d", type); ++ return -EINVAL; ++ } ++ ++ if (reset) ++ packet->bitfields2.action = ++ action__mes_unmap_queues__reset_queues; ++ else ++ packet->bitfields2.action = ++ action__mes_unmap_queues__preempt_queues; ++ ++ switch (filter) { ++ case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues; ++ packet->bitfields2.num_queues = 1; ++ packet->bitfields3b.doorbell_offset0 = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_BY_PASID: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; ++ packet->bitfields3a.pasid = filter_param; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_queues; ++ break; ++ case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: ++ /* in this case, we do not preempt static queues */ ++ packet->bitfields2.queue_sel = ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues; ++ break; ++ default: ++ WARN(1, "filter %d", filter); ++ return -EINVAL; ++ } ++ ++ return 0; ++ ++} ++ ++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value) ++{ ++ struct pm4_mes_query_status *packet; ++ ++ packet = (struct pm4_mes_query_status *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mes_query_status)); ++ ++ ++ packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, ++ sizeof(struct 
pm4_mes_query_status)); ++ ++ packet->bitfields2.context_id = 0; ++ packet->bitfields2.interrupt_sel = ++ interrupt_sel__mes_query_status__completion_status; ++ packet->bitfields2.command = ++ command__mes_query_status__fence_only_after_write_ack; ++ ++ packet->addr_hi = upper_32_bits((uint64_t)fence_address); ++ packet->addr_lo = lower_32_bits((uint64_t)fence_address); ++ packet->data_hi = upper_32_bits((uint64_t)fence_value); ++ packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ ++ return 0; ++} ++ ++ ++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) ++{ ++ struct pm4_mec_release_mem *packet; ++ ++ packet = (struct pm4_mec_release_mem *)buffer; ++ memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); ++ ++ packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, ++ sizeof(struct pm4_mec_release_mem)); ++ ++ packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; ++ packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; ++ packet->bitfields2.tcl1_action_ena = 1; ++ packet->bitfields2.tc_action_ena = 1; ++ packet->bitfields2.cache_policy = cache_policy___release_mem__lru; ++ packet->bitfields2.atc = 0; ++ ++ packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low; ++ packet->bitfields3.int_sel = ++ int_sel___release_mem__send_interrupt_after_write_confirm; ++ ++ packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; ++ packet->address_hi = upper_32_bits(gpu_addr); ++ ++ packet->data_lo = 0; ++ ++ return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); ++} ++ ++uint32_t pm_get_map_process_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_map_process); ++} ++ ++uint32_t pm_get_runlist_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_runlist); ++} ++ ++uint32_t pm_get_set_resources_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_set_resources); ++} ++ ++uint32_t pm_get_map_queues_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_map_queues); ++} ++ ++uint32_t pm_get_unmap_queues_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_unmap_queues); ++} ++ ++uint32_t pm_get_query_status_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mes_query_status); ++} ++ ++uint32_t pm_get_release_mem_packet_size_vi(void) ++{ ++ return sizeof(struct pm4_mec_release_mem); ++} ++ ++ ++static struct packet_manager_funcs kfd_vi_pm_funcs = { ++ .map_process = pm_map_process_vi, ++ .runlist = pm_runlist_vi, ++ .set_resources = pm_set_resources_vi, ++ .map_queues = pm_map_queues_vi, ++ .unmap_queues = pm_unmap_queues_vi, ++ .query_status = pm_query_status_vi, ++ .release_mem = pm_release_mem_vi, ++ .get_map_process_packet_size = pm_get_map_process_packet_size_vi, ++ .get_runlist_packet_size = pm_get_runlist_packet_size_vi, ++ .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, ++ .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, ++ .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, ++ .get_query_status_packet_size = pm_get_query_status_packet_size_vi, ++ .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, ++}; ++ ++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) ++{ ++ pm->pmf = &kfd_vi_pm_funcs; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +index 3ac72be..34d44ff 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c +@@ -30,10 +30,10 @@ + #define KFD_DRIVER_AUTHOR "AMD Inc. 
and others" + + #define KFD_DRIVER_DESC "Standalone HSA driver for AMD's GPUs" +-#define KFD_DRIVER_DATE "20150421" +-#define KFD_DRIVER_MAJOR 0 +-#define KFD_DRIVER_MINOR 7 +-#define KFD_DRIVER_PATCHLEVEL 2 ++#define KFD_DRIVER_DATE "20160408" ++#define KFD_DRIVER_MAJOR 2 ++#define KFD_DRIVER_MINOR 0 ++#define KFD_DRIVER_PATCHLEVEL 0 + + static const struct kgd2kfd_calls kgd2kfd = { + .exit = kgd2kfd_exit, +@@ -43,6 +43,12 @@ static const struct kgd2kfd_calls kgd2kfd = { + .interrupt = kgd2kfd_interrupt, + .suspend = kgd2kfd_suspend, + .resume = kgd2kfd_resume, ++ .quiesce_mm = kgd2kfd_quiesce_mm, ++ .resume_mm = kgd2kfd_resume_mm, ++ .schedule_evict_and_restore_process = ++ kgd2kfd_schedule_evict_and_restore_process, ++ .pre_reset = kgd2kfd_pre_reset, ++ .post_reset = kgd2kfd_post_reset, + }; + + int sched_policy = KFD_SCHED_POLICY_HWS; +@@ -69,12 +75,27 @@ module_param(send_sigterm, int, 0444); + MODULE_PARM_DESC(send_sigterm, + "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); + ++static int amdkfd_init_completed; ++ ++int debug_largebar; ++module_param(debug_largebar, int, 0444); ++MODULE_PARM_DESC(debug_largebar, ++ "Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)"); ++ + int ignore_crat; + module_param(ignore_crat, int, 0444); + MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + +-static int amdkfd_init_completed; ++int vega10_noretry = 1; ++module_param_named(noretry, vega10_noretry, int, 0644); ++MODULE_PARM_DESC(noretry, ++ "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled, 1 = retry disabled (default))"); ++ ++int priv_cp_queues; ++module_param(priv_cp_queues, int, 0644); ++MODULE_PARM_DESC(priv_cp_queues, ++ "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); + + int kgd2kfd_init(unsigned int interface_version, + const struct kgd2kfd_calls **g2f) +@@ -126,7 +147,15 @@ static int __init kfd_module_init(void) + if (err < 0) + goto err_topology; + +- kfd_process_create_wq(); ++ err = kfd_ipc_init(); ++ if (err < 0) ++ goto err_topology; ++ ++ err = kfd_process_create_wq(); ++ if (err < 0) ++ goto err_create_wq; ++ ++ kfd_init_peer_direct(); + + kfd_debugfs_init(); + +@@ -136,6 +165,7 @@ static int __init kfd_module_init(void) + + return 0; + ++err_create_wq: + err_topology: + kfd_chardev_exit(); + err_ioctl: +@@ -147,6 +177,7 @@ static void __exit kfd_module_exit(void) + amdkfd_init_completed = 0; + + kfd_debugfs_fini(); ++ kfd_close_peer_direct(); + kfd_process_destroy_wq(); + kfd_topology_shutdown(); + kfd_chardev_exit(); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +index dfd260e..8279b74 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +@@ -23,14 +23,66 @@ + + #include "kfd_priv.h" + ++/* Mapping queue priority to pipe priority, indexed by queue priority */ ++int pipe_priority_map[] = { ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_LOW, ++ KFD_PIPE_PRIORITY_CS_MEDIUM, ++ KFD_PIPE_PRIORITY_CS_MEDIUM, ++ KFD_PIPE_PRIORITY_CS_MEDIUM, ++ KFD_PIPE_PRIORITY_CS_MEDIUM, ++ KFD_PIPE_PRIORITY_CS_HIGH, ++ KFD_PIPE_PRIORITY_CS_HIGH, ++ KFD_PIPE_PRIORITY_CS_HIGH, ++ KFD_PIPE_PRIORITY_CS_HIGH, ++ KFD_PIPE_PRIORITY_CS_HIGH ++}; ++ 
++/* Mapping queue priority to SPI priority, indexed by queue priority ++ * SPI priority 2 and 3 are reserved for trap handler context save ++ */ ++int spi_priority_map[] = { ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_EXTRA_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_LOW ++}; ++ + struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) + { + switch (dev->device_info->asic_family) { + case CHIP_KAVERI: + return mqd_manager_init_cik(type, dev); ++ case CHIP_HAWAII: ++ return mqd_manager_init_cik_hawaii(type, dev); + case CHIP_CARRIZO: + return mqd_manager_init_vi(type, dev); ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: ++ return mqd_manager_init_vi_tonga(type, dev); ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ return mqd_manager_init_v9(type, dev); + default: + WARN(1, "Unexpected ASIC family %u", + dev->device_info->asic_family); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +index 8972bcf..dcaeda8 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +@@ -43,6 +43,9 @@ + * + * @is_occupied: Checks if the relevant HQD slot is occupied. + * ++ * @get_wave_state: Retrieves context save state and optionally copies the ++ * control stack, if kept in the MQD, to the given userspace address. ++ * + * @mqd_mutex: Mqd manager mutex. + * + * @dev: The kfd device structure coupled with this module. +@@ -59,7 +62,8 @@ + * per KFD_MQD_TYPE for each device. 
+ * + */ +- ++extern int pipe_priority_map[]; ++extern int spi_priority_map[]; + struct mqd_manager { + int (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +@@ -85,6 +89,11 @@ struct mqd_manager { + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id); + ++ int (*get_wave_state)(struct mqd_manager *mm, void *mqd, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size); ++ + #if defined(CONFIG_DEBUG_FS) + int (*debugfs_show_mqd)(struct seq_file *m, void *data); + #endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +index f8ef4a0..bd44a23 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +@@ -30,6 +30,7 @@ + #include "cik_regs.h" + #include "cik_structs.h" + #include "oss/oss_2_4_sh_mask.h" ++#include "gca/gfx_7_2_sh_mask.h" + + static inline struct cik_mqd *get_mqd(void *mqd) + { +@@ -41,6 +42,68 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) + return (struct cik_sdma_rlc_registers *)mqd; + } + ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct cik_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ ++ uint32_t cu_mask_count = q->cu_mask_count; ++ const uint32_t *cu_mask = q->cu_mask; ++ int se, cu_per_sh, cu_index, i; ++ ++ if (cu_mask_count == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ ++ /* If # CU mask bits > # CUs, set it to the # of CUs */ ++ if (cu_mask_count > cu_info.cu_active_number) ++ cu_mask_count = cu_info.cu_active_number; ++ ++ cu_index = 0; ++ for (se = 0; se < cu_info.num_shader_engines; se++) { ++ cu_per_sh = 0; ++ ++ /* Get the number of CUs on this Shader Engine */ ++ for (i = 0; i < 4; i++) ++ cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); ++ ++ se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); ++ if ((cu_per_sh + (cu_index % 32)) > 32) ++ se_mask[se] |= cu_mask[(cu_index / 32) + 1] ++ << (32 - (cu_index % 32)); ++ se_mask[se] &= (1 << cu_per_sh) - 1; ++ cu_index += cu_per_sh; ++ } ++ m->compute_static_thread_mgmt_se0 = se_mask[0]; ++ m->compute_static_thread_mgmt_se1 = se_mask[1]; ++ m->compute_static_thread_mgmt_se2 = se_mask[2]; ++ m->compute_static_thread_mgmt_se3 = se_mask[3]; ++ ++ pr_debug("Update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ ++static void set_priority(struct cik_mqd *m, struct queue_properties *q) ++{ ++ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; ++ m->cp_hqd_queue_priority = q->priority; ++ m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & ++ (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | ++ (spi_priority_map[q->priority] << ++ COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -79,10 +142,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + +- m->cp_hqd_ib_control = 
DEFAULT_MIN_IB_AVAIL_SIZE | IB_ATC_EN; +- /* Although WinKFD writes this, I suspect it should not be necessary */ +- m->cp_hqd_ib_control = IB_ATC_EN | DEFAULT_MIN_IB_AVAIL_SIZE; +- + m->cp_hqd_quantum = QUANTUM_EN | QUANTUM_SCALE_1MS | + QUANTUM_DURATION(10); + +@@ -95,8 +154,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ +- m->cp_hqd_pipe_priority = 1; +- m->cp_hqd_queue_priority = 15; ++ set_priority(m, q); + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_iq_rptr = AQL_ENABLE; +@@ -170,14 +228,19 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + mms); + } + +-static int update_mqd(struct mqd_manager *mm, void *mqd, +- struct queue_properties *q) ++static int __update_mqd(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q, unsigned int atc_bit) + { + struct cik_mqd *m; + + m = get_mqd(mqd); + m->cp_hqd_pq_control = DEFAULT_RPTR_BLOCK_SIZE | +- DEFAULT_MIN_AVAIL_SIZE | PQ_ATC_EN; ++ DEFAULT_MIN_AVAIL_SIZE; ++ m->cp_hqd_ib_control = DEFAULT_MIN_IB_AVAIL_SIZE; ++ if (atc_bit) { ++ m->cp_hqd_pq_control |= PQ_ATC_EN; ++ m->cp_hqd_ib_control |= IB_ATC_EN; ++ } + + /* + * Calculating queue size which is log base 2 of actual queue size -1 +@@ -194,14 +257,33 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + + if (q->format == KFD_QUEUE_FORMAT_AQL) + m->cp_hqd_pq_control |= NO_UPDATE_RPTR; ++ if (priv_cp_queues) ++ m->cp_hqd_pq_control |= ++ 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; ++ ++ update_cu_mask(mm, mqd, q); ++ set_priority(m, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0); ++ q->queue_percent > 0 && ++ !q->is_evicted); + + return 0; + } + ++static int update_mqd(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ return __update_mqd(mm, mqd, q, 1); ++} ++ ++static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ return __update_mqd(mm, mqd, q, 0); ++} ++ + static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) + { +@@ -228,7 +310,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0); ++ q->queue_percent > 0 && ++ !q->is_evicted); + + return 0; + } +@@ -323,8 +406,7 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + * 1 = CS_MEDIUM (typically between HP3D and GFX + * 2 = CS_HIGH (typically above HP3D) + */ +- m->cp_hqd_pipe_priority = 1; +- m->cp_hqd_queue_priority = 15; ++ set_priority(m, q); + + *mqd = m; + if (gart_addr) +@@ -360,8 +442,10 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0); ++ q->queue_percent > 0 && ++ !q->is_evicted); + ++ set_priority(m, q); + return 0; + } + +@@ -392,7 +476,7 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + +- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); ++ mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + +@@ -441,3 +525,15 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + return mqd; + } + ++struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ mqd = mqd_manager_init_cik(type, dev); ++ if (!mqd) ++ return NULL; ++ if ((type == 
KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) ++ mqd->update_mqd = update_mqd_hawaii; ++ return mqd; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +new file mode 100644 +index 0000000..f4e8efc +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +@@ -0,0 +1,523 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#include <linux/printk.h> ++#include <linux/slab.h> ++#include <linux/uaccess.h> ++#include "kfd_priv.h" ++#include "kfd_mqd_manager.h" ++#include "v9_structs.h" ++#include "gc/gc_9_0_offset.h" ++#include "gc/gc_9_0_sh_mask.h" ++#include "sdma0/sdma0_4_0_sh_mask.h" ++ ++static inline struct v9_mqd *get_mqd(void *mqd) ++{ ++ return (struct v9_mqd *)mqd; ++} ++ ++static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) ++{ ++ return (struct v9_sdma_mqd *)mqd; ++} ++ ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ ++ uint32_t cu_mask_count = q->cu_mask_count; ++ const uint32_t *cu_mask = q->cu_mask; ++ int se, cu_per_sh, cu_index, i; ++ ++ if (cu_mask_count == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ ++ /* If # CU mask bits > # CUs, set it to the # of CUs */ ++ if (cu_mask_count > cu_info.cu_active_number) ++ cu_mask_count = cu_info.cu_active_number; ++ ++ cu_index = 0; ++ for (se = 0; se < cu_info.num_shader_engines; se++) { ++ cu_per_sh = 0; ++ ++ /* Get the number of CUs on this Shader Engine */ ++ for (i = 0; i < 4; i++) ++ cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); ++ ++ se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); ++ if ((cu_per_sh + (cu_index % 32)) > 32) ++ se_mask[se] |= cu_mask[(cu_index / 32) + 1] ++ << (32 - (cu_index % 32)); ++ se_mask[se] &= (1 << cu_per_sh) - 1; ++ cu_index += cu_per_sh; ++ } ++ m->compute_static_thread_mgmt_se0 = se_mask[0]; ++ m->compute_static_thread_mgmt_se1 = se_mask[1]; ++ m->compute_static_thread_mgmt_se2 = se_mask[2]; ++ m->compute_static_thread_mgmt_se3 = se_mask[3]; ++ ++ pr_debug("update cu mask to %#x %#x %#x %#x\n", ++ 
m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ ++static int init_mqd(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ uint64_t addr; ++ struct v9_mqd *m; ++ struct kfd_dev *kfd = mm->dev; ++ ++ /* From V9, for CWSR, the control stack is located on the next page ++ * boundary after the mqd, we will use the gtt allocation function ++ * instead of sub-allocation function. ++ */ ++ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { ++ *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); ++ if (!*mqd_mem_obj) ++ return -ENOMEM; ++ retval = kfd->kfd2kgd->init_gtt_mem_allocation(kfd->kgd, ++ ALIGN(q->ctl_stack_size, PAGE_SIZE) + ++ ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), ++ &((*mqd_mem_obj)->gtt_mem), ++ &((*mqd_mem_obj)->gpu_addr), ++ (void *)&((*mqd_mem_obj)->cpu_ptr)); ++ } else ++ retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), ++ mqd_mem_obj); ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ addr = (*mqd_mem_obj)->gpu_addr; ++ ++ memset(m, 0, sizeof(struct v9_mqd)); ++ ++ m->header = 0xC0310800; ++ m->compute_pipelinestat_enable = 1; ++ m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; ++ m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; ++ ++ m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | ++ 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; ++ ++ m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; ++ ++ m->cp_mqd_base_addr_lo = lower_32_bits(addr); ++ m->cp_mqd_base_addr_hi = upper_32_bits(addr); ++ ++ m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | ++ 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | ++ 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; ++ ++ m->cp_hqd_pipe_priority = 1; ++ m->cp_hqd_queue_priority = 15; ++ ++ if (q->format == KFD_QUEUE_FORMAT_AQL) { ++ m->cp_hqd_aql_control = ++ 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; ++ } ++ ++ if (q->tba_addr) { ++ m->compute_pgm_rsrc2 |= ++ (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); ++ } ++ ++ if (mm->dev->cwsr_enabled) { ++ m->cp_hqd_persistent_state |= ++ (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); ++ m->cp_hqd_ctx_save_base_addr_lo = ++ lower_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_base_addr_hi = ++ upper_32_bits(q->ctx_save_restore_area_address); ++ m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; ++ m->cp_hqd_cntl_stack_size = q->ctl_stack_size; ++ m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; ++ m->cp_hqd_wg_state_offset = q->ctl_stack_size; ++ } ++ ++ *mqd = m; ++ if (gart_addr) ++ *gart_addr = addr; ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static int load_mqd(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ struct queue_properties *p, struct mm_struct *mms) ++{ ++ /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ ++ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 
4 : 0); ++ ++ return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, ++ (uint32_t __user *)p->write_ptr, ++ wptr_shift, 0, mms); ++} ++ ++static int update_mqd(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ ++ m = get_mqd(mqd); ++ ++ m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; ++ m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; ++ pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); ++ ++ m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); ++ m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); ++ ++ m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); ++ m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); ++ ++ m->cp_hqd_pq_doorbell_control = ++ q->doorbell_off << ++ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; ++ pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", ++ m->cp_hqd_pq_doorbell_control); ++ ++ m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | ++ 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; ++ ++ /* ++ * HW does not clamp this field correctly. Maximum EOP queue size ++ * is constrained by per-SE EOP done signal count, which is 8-bit. ++ * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit ++ * more than (EOP entry count - 1) so a queue size of 0x800 dwords ++ * is safe, giving a maximum field value of 0xA. ++ */ ++ m->cp_hqd_eop_control = min(0xA, ++ order_base_2(q->eop_ring_buffer_size / 4) - 1); ++ m->cp_hqd_eop_base_addr_lo = ++ lower_32_bits(q->eop_ring_buffer_address >> 8); ++ m->cp_hqd_eop_base_addr_hi = ++ upper_32_bits(q->eop_ring_buffer_address >> 8); ++ ++ m->cp_hqd_iq_timer = 0; ++ ++ m->cp_hqd_vmid = q->vmid; ++ ++ if (q->format == KFD_QUEUE_FORMAT_AQL) { ++ m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | ++ 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; ++ m->cp_hqd_pq_doorbell_control |= ++ 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; ++ } ++ if (priv_cp_queues) ++ m->cp_hqd_pq_control |= ++ 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; ++ if (mm->dev->cwsr_enabled) ++ m->cp_hqd_ctx_save_control = 0; ++ ++ update_cu_mask(mm, mqd, q); ++ ++ q->is_active = (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted); ++ ++ return 0; ++} ++ ++ ++static int destroy_mqd(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_destroy ++ (mm->dev->kgd, mqd, type, timeout, ++ pipe_id, queue_id); ++} ++ ++static void uninit_mqd(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ struct kfd_dev *kfd = mm->dev; ++ ++ if (mqd_mem_obj->gtt_mem) { ++ kfd->kfd2kgd->free_gtt_mem(kfd->kgd, mqd_mem_obj->gtt_mem); ++ kfree(mqd_mem_obj); ++ } else { ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++ } ++} ++ ++static bool is_occupied(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_is_occupied( ++ mm->dev->kgd, queue_address, ++ pipe_id, queue_id); ++} ++ ++static int get_wave_state(struct mqd_manager *mm, void *mqd, ++ void __user 
*ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size) ++{ ++ struct v9_mqd *m; ++ ++ /* Control stack is located one page after MQD. */ ++ void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); ++ ++ m = get_mqd(mqd); ++ ++ *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - ++ m->cp_hqd_cntl_stack_offset; ++ *save_area_used_size = m->cp_hqd_wg_state_offset - ++ m->cp_hqd_cntl_stack_size; ++ ++ if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); ++ ++ if (retval != 0) ++ return retval; ++ ++ m = get_mqd(*mqd); ++ ++ m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | ++ 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; ++ ++ return retval; ++} ++ ++static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_mqd *m; ++ int retval = update_mqd(mm, mqd, q); ++ ++ if (retval != 0) ++ return retval; ++ ++ /* TODO: what's the point? update_mqd already does this. */ ++ m = get_mqd(mqd); ++ m->cp_hqd_vmid = q->vmid; ++ return retval; ++} ++ ++static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, ++ struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, ++ struct queue_properties *q) ++{ ++ int retval; ++ struct v9_sdma_mqd *m; ++ ++ ++ retval = kfd_gtt_sa_allocate(mm->dev, ++ sizeof(struct v9_sdma_mqd), ++ mqd_mem_obj); ++ ++ if (retval != 0) ++ return -ENOMEM; ++ ++ m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; ++ ++ memset(m, 0, sizeof(struct v9_sdma_mqd)); ++ ++ *mqd = m; ++ if (gart_addr) ++ *gart_addr = (*mqd_mem_obj)->gpu_addr; ++ ++ retval = mm->update_mqd(mm, m, q); ++ ++ return retval; ++} ++ ++static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct kfd_mem_obj *mqd_mem_obj) ++{ ++ kfd_gtt_sa_free(mm->dev, mqd_mem_obj); ++} ++ ++static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ uint32_t pipe_id, uint32_t queue_id, ++ struct queue_properties *p, struct mm_struct *mms) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, ++ (uint32_t __user *)p->write_ptr, ++ mms); ++} ++ ++#define SDMA_RLC_DUMMY_DEFAULT 0xf ++ ++static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct v9_sdma_mqd *m; ++ ++ m = get_sdma_mqd(mqd); ++ m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) ++ << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | ++ q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | ++ 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | ++ 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; ++ ++ m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); ++ m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->sdmax_rlcx_doorbell_offset = ++ q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; ++ ++ m->sdma_engine_id = q->sdma_engine_id; ++ m->sdma_queue_id = q->sdma_queue_id; ++ m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; ++ ++ q->is_active = (q->queue_size > 0 && ++ q->queue_address != 0 && ++ q->queue_percent > 0 && ++ !q->is_evicted); ++ ++ return 0; ++} ++ ++/* ++ * * preempt type here is ignored because there is only one way ++ * * to preempt sdma 
queue ++ */ ++static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, ++ enum kfd_preempt_type type, ++ unsigned int timeout, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); ++} ++ ++static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, ++ uint64_t queue_address, uint32_t pipe_id, ++ uint32_t queue_id) ++{ ++ return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); ++} ++ ++#if defined(CONFIG_DEBUG_FS) ++ ++static int debugfs_show_mqd(struct seq_file *m, void *data) ++{ ++ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, ++ data, sizeof(struct v9_mqd), false); ++ return 0; ++} ++ ++static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) ++{ ++ seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, ++ data, sizeof(struct v9_sdma_mqd), false); ++ return 0; ++} ++ ++#endif ++ ++struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) ++ return NULL; ++ ++ mqd = kzalloc(sizeof(*mqd), GFP_NOIO); ++ if (!mqd) ++ return NULL; ++ ++ mqd->dev = dev; ++ ++ switch (type) { ++ case KFD_MQD_TYPE_CP: ++ case KFD_MQD_TYPE_COMPUTE: ++ mqd->init_mqd = init_mqd; ++ mqd->uninit_mqd = uninit_mqd; ++ mqd->load_mqd = load_mqd; ++ mqd->update_mqd = update_mqd; ++ mqd->destroy_mqd = destroy_mqd; ++ mqd->is_occupied = is_occupied; ++ mqd->get_wave_state = get_wave_state; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd; ++#endif ++ break; ++ case KFD_MQD_TYPE_HIQ: ++ mqd->init_mqd = init_mqd_hiq; ++ mqd->uninit_mqd = uninit_mqd; ++ mqd->load_mqd = load_mqd; ++ mqd->update_mqd = update_mqd_hiq; ++ mqd->destroy_mqd = destroy_mqd; ++ mqd->is_occupied = is_occupied; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd; ++#endif ++ break; ++ case KFD_MQD_TYPE_SDMA: ++ mqd->init_mqd = init_mqd_sdma; ++ mqd->uninit_mqd = uninit_mqd_sdma; ++ mqd->load_mqd = load_mqd_sdma; ++ mqd->update_mqd = update_mqd_sdma; ++ mqd->destroy_mqd = destroy_mqd_sdma; ++ mqd->is_occupied = is_occupied_sdma; ++#if defined(CONFIG_DEBUG_FS) ++ mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; ++#endif ++ break; ++ default: ++ kfree(mqd); ++ return NULL; ++ } ++ ++ return mqd; ++} +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +index 971aec0..eff7580 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +@@ -31,6 +31,7 @@ + #include "gca/gfx_8_0_sh_mask.h" + #include "gca/gfx_8_0_enum.h" + #include "oss/oss_3_0_sh_mask.h" ++ + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 + + static inline struct vi_mqd *get_mqd(void *mqd) +@@ -43,6 +44,68 @@ static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) + return (struct vi_sdma_mqd *)mqd; + } + ++static void update_cu_mask(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ struct vi_mqd *m; ++ struct kfd_cu_info cu_info; ++ uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ ++ uint32_t cu_mask_count = q->cu_mask_count; ++ const uint32_t *cu_mask = q->cu_mask; ++ int se, cu_per_sh, cu_index, i; ++ ++ if (cu_mask_count == 0) ++ return; ++ ++ m = get_mqd(mqd); ++ m->compute_static_thread_mgmt_se0 = 0; ++ m->compute_static_thread_mgmt_se1 = 0; ++ m->compute_static_thread_mgmt_se2 = 0; ++ m->compute_static_thread_mgmt_se3 = 0; ++ ++ mm->dev->kfd2kgd->get_cu_info(mm->dev->kgd, &cu_info); ++ ++ /* If # CU mask 
bits > # CUs, set it to the # of CUs */ ++ if (cu_mask_count > cu_info.cu_active_number) ++ cu_mask_count = cu_info.cu_active_number; ++ ++ cu_index = 0; ++ for (se = 0; se < cu_info.num_shader_engines; se++) { ++ cu_per_sh = 0; ++ ++ /* Get the number of CUs on this Shader Engine */ ++ for (i = 0; i < 4; i++) ++ cu_per_sh += hweight32(cu_info.cu_bitmap[se][i]); ++ ++ se_mask[se] = cu_mask[cu_index / 32] >> (cu_index % 32); ++ if ((cu_per_sh + (cu_index % 32)) > 32) ++ se_mask[se] |= cu_mask[(cu_index / 32) + 1] ++ << (32 - (cu_index % 32)); ++ se_mask[se] &= (1 << cu_per_sh) - 1; ++ cu_index += cu_per_sh; ++ } ++ m->compute_static_thread_mgmt_se0 = se_mask[0]; ++ m->compute_static_thread_mgmt_se1 = se_mask[1]; ++ m->compute_static_thread_mgmt_se2 = se_mask[2]; ++ m->compute_static_thread_mgmt_se3 = se_mask[3]; ++ ++ pr_debug("Update cu mask to %#x %#x %#x %#x\n", ++ m->compute_static_thread_mgmt_se0, ++ m->compute_static_thread_mgmt_se1, ++ m->compute_static_thread_mgmt_se2, ++ m->compute_static_thread_mgmt_se3); ++} ++ ++static void set_priority(struct vi_mqd *m, struct queue_properties *q) ++{ ++ m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; ++ m->cp_hqd_queue_priority = q->priority; ++ m->compute_pgm_rsrc1 = (m->compute_pgm_rsrc1 & ++ (~COMPUTE_PGM_RSRC1__PRIORITY_MASK)) | ++ (spi_priority_map[q->priority] << ++ COMPUTE_PGM_RSRC1__PRIORITY__SHIFT); ++} ++ + static int init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -81,9 +144,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + +- m->cp_hqd_pipe_priority = 1; +- m->cp_hqd_queue_priority = 15; +- ++ set_priority(m, q); + m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; + + if (q->format == KFD_QUEUE_FORMAT_AQL) +@@ -98,7 +159,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + +- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { ++ if (mm->dev->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = +@@ -151,6 +212,8 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); ++ m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); ++ m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << +@@ -188,15 +251,21 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; + } +- +- if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) ++ if (priv_cp_queues) ++ m->cp_hqd_pq_control |= ++ 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; ++ if (mm->dev->cwsr_enabled) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + ++ update_cu_mask(mm, mqd, q); ++ set_priority(m, q); ++ + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0); ++ q->queue_percent > 0 && ++ !q->is_evicted); + + return 0; + } +@@ -208,6 +277,12 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, + return 
__update_mqd(mm, mqd, q, MTYPE_CC, 1); + } + ++static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, ++ struct queue_properties *q) ++{ ++ return __update_mqd(mm, mqd, q, MTYPE_UC, 0); ++} ++ + static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, +@@ -233,6 +308,28 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd, + pipe_id, queue_id); + } + ++static int get_wave_state(struct mqd_manager *mm, void *mqd, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size) ++{ ++ struct vi_mqd *m; ++ ++ m = get_mqd(mqd); ++ ++ *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - ++ m->cp_hqd_cntl_stack_offset; ++ *save_area_used_size = m->cp_hqd_wg_state_offset - ++ m->cp_hqd_cntl_stack_size; ++ ++ /* Control stack is not copied to user mode for GFXv8 because ++ * it's part of the context save area that is already ++ * accessible to user mode ++ */ ++ ++ return 0; ++} ++ + static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +@@ -285,7 +382,7 @@ static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; +- if (gart_addr != NULL) ++ if (gart_addr) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); +@@ -334,7 +431,8 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && +- q->queue_percent > 0); ++ q->queue_percent > 0 && ++ !q->is_evicted); + + return 0; + } +@@ -384,7 +482,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + +- mqd = kzalloc(sizeof(*mqd), GFP_KERNEL); ++ mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + +@@ -399,6 +497,7 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; ++ mqd->get_wave_state = get_wave_state; + #if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; + #endif +@@ -432,3 +531,17 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + + return mqd; + } ++ ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev) ++{ ++ struct mqd_manager *mqd; ++ ++ mqd = mqd_manager_init_vi(type, dev); ++ if (!mqd) ++ return NULL; ++ if ((type == KFD_MQD_TYPE_CP) || (type == KFD_MQD_TYPE_COMPUTE)) ++ mqd->update_mqd = update_mqd_tonga; ++ return mqd; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 0ecbd1f..98c89d2 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -26,7 +26,6 @@ + #include "kfd_device_queue_manager.h" + #include "kfd_kernel_queue.h" + #include "kfd_priv.h" +-#include "kfd_pm4_headers_vi.h" + #include "kfd_pm4_opcodes.h" + + static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, +@@ -39,18 +38,6 @@ static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, + *wptr = temp; + } + +-static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) +-{ +- union PM4_MES_TYPE_3_HEADER header; +- +- header.u32All = 0; +- header.opcode = opcode; +- header.count = packet_size / 4 - 2; +- header.type = PM4_TYPE_3; +- +- return 
header.u32All; +-} +- + static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) +@@ -58,7 +45,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int process_count, queue_count, compute_queue_count; + unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; +- struct kfd_dev *dev = pm->dqm->dev; ++ ++ struct kfd_dev *dev = pm->dqm->dev; + + process_count = pm->dqm->processes_count; + queue_count = pm->dqm->queue_count; +@@ -69,20 +57,21 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ ++ + *over_subscription = false; + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || +- compute_queue_count > get_queues_num(pm->dqm)) { ++ compute_queue_count > get_queues_num(pm->dqm)) { + *over_subscription = true; + pr_debug("Over subscribed runlist\n"); + } + +- map_queue_size = sizeof(struct pm4_mes_map_queues); ++ map_queue_size = pm->pmf->get_map_queues_packet_size(); + /* calculate run list ib allocation size */ +- *rlib_size = process_count * sizeof(struct pm4_mes_map_process) + ++ *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + + queue_count * map_queue_size; + + /* +@@ -90,7 +79,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + * when over subscription + */ + if (*over_subscription) +- *rlib_size += sizeof(struct pm4_mes_runlist); ++ *rlib_size += pm->pmf->get_runlist_packet_size(); + + pr_debug("runlist ib size %d\n", *rlib_size); + } +@@ -108,12 +97,14 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription); + ++ mutex_lock(&pm->lock); ++ + retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size, + &pm->ib_buffer_obj); + + if (retval) { + pr_err("Failed to allocate runlist IB\n"); +- return retval; ++ goto out; + } + + *(void **)rl_buffer = pm->ib_buffer_obj->cpu_ptr; +@@ -121,139 +112,10 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm, + + memset(*rl_buffer, 0, *rl_buffer_size); + pm->allocated = true; +- return retval; +-} +- +-static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, +- uint64_t ib, size_t ib_size_in_dwords, bool chain) +-{ +- struct pm4_mes_runlist *packet; +- int concurrent_proc_cnt = 0; +- struct kfd_dev *kfd = pm->dqm->dev; + +- if (WARN_ON(!ib)) +- return -EFAULT; +- +- /* Determine the number of processes to map together to HW: +- * it can not exceed the number of VMIDs available to the +- * scheduler, and it is determined by the smaller of the number +- * of processes in the runlist and kfd module parameter +- * hws_max_conc_proc. +- * Note: the arbitration between the number of VMIDs and +- * hws_max_conc_proc has been done in +- * kgd2kfd_device_init(). +- */ +- concurrent_proc_cnt = min(pm->dqm->processes_count, +- kfd->max_proc_per_quantum); +- +- packet = (struct pm4_mes_runlist *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_mes_runlist)); +- packet->header.u32All = build_pm4_header(IT_RUN_LIST, +- sizeof(struct pm4_mes_runlist)); +- +- packet->bitfields4.ib_size = ib_size_in_dwords; +- packet->bitfields4.chain = chain ? 
1 : 0; +- packet->bitfields4.offload_polling = 0; +- packet->bitfields4.valid = 1; +- packet->bitfields4.process_cnt = concurrent_proc_cnt; +- packet->ordinal2 = lower_32_bits(ib); +- packet->bitfields3.ib_base_hi = upper_32_bits(ib); +- +- return 0; +-} +- +-static int pm_create_map_process(struct packet_manager *pm, uint32_t *buffer, +- struct qcm_process_device *qpd) +-{ +- struct pm4_mes_map_process *packet; +- +- packet = (struct pm4_mes_map_process *)buffer; +- +- memset(buffer, 0, sizeof(struct pm4_mes_map_process)); +- +- packet->header.u32All = build_pm4_header(IT_MAP_PROCESS, +- sizeof(struct pm4_mes_map_process)); +- packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; +- packet->bitfields2.process_quantum = 1; +- packet->bitfields2.pasid = qpd->pqm->process->pasid; +- packet->bitfields3.page_table_base = qpd->page_table_base; +- packet->bitfields10.gds_size = qpd->gds_size; +- packet->bitfields10.num_gws = qpd->num_gws; +- packet->bitfields10.num_oac = qpd->num_oac; +- packet->bitfields10.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; +- +- packet->sh_mem_config = qpd->sh_mem_config; +- packet->sh_mem_bases = qpd->sh_mem_bases; +- packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; +- packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; +- +- /* TODO: scratch support */ +- packet->sh_hidden_private_base_vmid = 0; +- +- packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); +- packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); +- +- return 0; +-} +- +-static int pm_create_map_queue(struct packet_manager *pm, uint32_t *buffer, +- struct queue *q, bool is_static) +-{ +- struct pm4_mes_map_queues *packet; +- bool use_static = is_static; +- +- packet = (struct pm4_mes_map_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); +- +- packet->header.u32All = build_pm4_header(IT_MAP_QUEUES, +- sizeof(struct pm4_mes_map_queues)); +- packet->bitfields2.alloc_format = +- alloc_format__mes_map_queues__one_per_pipe_vi; +- packet->bitfields2.num_queues = 1; +- packet->bitfields2.queue_sel = +- queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; +- +- packet->bitfields2.engine_sel = +- engine_sel__mes_map_queues__compute_vi; +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_compute_vi; +- +- switch (q->properties.type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- if (use_static) +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__normal_latency_static_queue_vi; +- break; +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.queue_type = +- queue_type__mes_map_queues__debug_interface_queue_vi; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = q->properties.sdma_engine_id + +- engine_sel__mes_map_queues__sdma0_vi; +- use_static = false; /* no static queues under SDMA */ +- break; +- default: +- WARN(1, "queue type %d", q->properties.type); +- return -EINVAL; +- } +- packet->bitfields3.doorbell_offset = +- q->properties.doorbell_off; +- +- packet->mqd_addr_lo = +- lower_32_bits(q->gart_mqd_addr); +- +- packet->mqd_addr_hi = +- upper_32_bits(q->gart_mqd_addr); +- +- packet->wptr_addr_lo = +- lower_32_bits((uint64_t)q->properties.write_ptr); +- +- packet->wptr_addr_hi = +- upper_32_bits((uint64_t)q->properties.write_ptr); +- +- return 0; ++out: ++ mutex_unlock(&pm->lock); ++ return retval; + } + + static int pm_create_runlist_ib(struct packet_manager *pm, +@@ -293,12 +155,12 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return -ENOMEM; + } + +- retval = pm_create_map_process(pm, 
&rl_buffer[rl_wptr], qpd); ++ retval = pm->pmf->map_process(pm, &rl_buffer[rl_wptr], qpd); + if (retval) + return retval; + + proccesses_mapped++; +- inc_wptr(&rl_wptr, sizeof(struct pm4_mes_map_process), ++ inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), + alloc_size_bytes); + + list_for_each_entry(kq, &qpd->priv_queue_list, list) { +@@ -308,7 +170,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("static_queue, mapping kernel q %d, is debug status %d\n", + kq->queue->queue, qpd->is_debug); + +- retval = pm_create_map_queue(pm, ++ retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + kq->queue, + qpd->is_debug); +@@ -316,7 +178,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return retval; + + inc_wptr(&rl_wptr, +- sizeof(struct pm4_mes_map_queues), ++ pm->pmf->get_map_queues_packet_size(), + alloc_size_bytes); + } + +@@ -327,16 +189,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("static_queue, mapping user queue %d, is debug status %d\n", + q->queue, qpd->is_debug); + +- retval = pm_create_map_queue(pm, ++ retval = pm->pmf->map_queues(pm, + &rl_buffer[rl_wptr], + q, + qpd->is_debug); +- + if (retval) + return retval; + + inc_wptr(&rl_wptr, +- sizeof(struct pm4_mes_map_queues), ++ pm->pmf->get_map_queues_packet_size(), + alloc_size_bytes); + } + } +@@ -344,7 +205,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + pr_debug("Finished map process and queues to runlist\n"); + + if (is_over_subscription) +- retval = pm_create_runlist(pm, &rl_buffer[rl_wptr], ++ retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], + *rl_gpu_addr, + alloc_size_bytes / sizeof(uint32_t), + true); +@@ -356,7 +217,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm, + return retval; + } + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver) + { + pm->dqm = dqm; + mutex_init(&pm->lock); +@@ -367,6 +229,26 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) + } + pm->allocated = false; + ++ switch (pm->dqm->dev->device_info->asic_family) { ++ case CHIP_KAVERI: ++ case CHIP_HAWAII: ++ kfd_pm_func_init_cik(pm, fw_ver); ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_TONGA: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: ++ kfd_pm_func_init_vi(pm, fw_ver); ++ break; ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ kfd_pm_func_init_v9(pm, fw_ver); ++ break; ++ default: ++ BUG(); ++ } ++ + return 0; + } + +@@ -379,38 +261,25 @@ void pm_uninit(struct packet_manager *pm) + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res) + { +- struct pm4_mes_set_resources *packet; ++ uint32_t *buffer, size; + int retval = 0; + ++ size = pm->pmf->get_set_resources_packet_size(); + mutex_lock(&pm->lock); + pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +- sizeof(*packet) / sizeof(uint32_t), +- (unsigned int **)&packet); +- if (!packet) { ++ size / sizeof(uint32_t), ++ (unsigned int **)&buffer); ++ if (!buffer) { + pr_err("Failed to allocate buffer on kernel queue\n"); + retval = -ENOMEM; + goto out; + } + +- memset(packet, 0, sizeof(struct pm4_mes_set_resources)); +- packet->header.u32All = build_pm4_header(IT_SET_RESOURCES, +- sizeof(struct pm4_mes_set_resources)); +- +- packet->bitfields2.queue_type = +- queue_type__mes_set_resources__hsa_interface_queue_hiq; +- packet->bitfields2.vmid_mask = res->vmid_mask; +- 
packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; +- packet->bitfields7.oac_mask = res->oac_mask; +- packet->bitfields8.gds_heap_base = res->gds_heap_base; +- packet->bitfields8.gds_heap_size = res->gds_heap_size; +- +- packet->gws_mask_lo = lower_32_bits(res->gws_mask); +- packet->gws_mask_hi = upper_32_bits(res->gws_mask); +- +- packet->queue_mask_lo = lower_32_bits(res->queue_mask); +- packet->queue_mask_hi = upper_32_bits(res->queue_mask); +- +- pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ retval = pm->pmf->set_resources(pm, buffer, res); ++ if (!retval) ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ else ++ pm->priv_queue->ops.rollback_packet(pm->priv_queue); + + out: + mutex_unlock(&pm->lock); +@@ -432,7 +301,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + + pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); + +- packet_size_dwords = sizeof(struct pm4_mes_runlist) / sizeof(uint32_t); ++ packet_size_dwords = pm->pmf->get_runlist_packet_size() / ++ sizeof(uint32_t); + mutex_lock(&pm->lock); + + retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, +@@ -440,8 +310,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + if (retval) + goto fail_acquire_packet_buffer; + +- retval = pm_create_runlist(pm, rl_buffer, rl_gpu_ib_addr, +- rl_ib_size / sizeof(uint32_t), false); ++ retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, ++ rl_ib_size / sizeof(uint32_t), false); + if (retval) + goto fail_create_runlist; + +@@ -463,37 +333,29 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) + int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, + uint32_t fence_value) + { +- int retval; +- struct pm4_mes_query_status *packet; ++ uint32_t *buffer, size; ++ int retval = 0; + + if (WARN_ON(!fence_address)) + return -EFAULT; + ++ size = pm->pmf->get_query_status_packet_size(); + mutex_lock(&pm->lock); +- retval = pm->priv_queue->ops.acquire_packet_buffer( +- pm->priv_queue, +- sizeof(struct pm4_mes_query_status) / sizeof(uint32_t), +- (unsigned int **)&packet); +- if (retval) +- goto fail_acquire_packet_buffer; +- +- packet->header.u32All = build_pm4_header(IT_QUERY_STATUS, +- sizeof(struct pm4_mes_query_status)); +- +- packet->bitfields2.context_id = 0; +- packet->bitfields2.interrupt_sel = +- interrupt_sel__mes_query_status__completion_status; +- packet->bitfields2.command = +- command__mes_query_status__fence_only_after_write_ack; +- +- packet->addr_hi = upper_32_bits((uint64_t)fence_address); +- packet->addr_lo = lower_32_bits((uint64_t)fence_address); +- packet->data_hi = upper_32_bits((uint64_t)fence_value); +- packet->data_lo = lower_32_bits((uint64_t)fence_value); ++ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, ++ size / sizeof(uint32_t), (unsigned int **)&buffer); ++ if (!buffer) { ++ pr_err("Failed to allocate buffer on kernel queue\n"); ++ retval = -ENOMEM; ++ goto out; ++ } + +- pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value); ++ if (!retval) ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); ++ else ++ pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +-fail_acquire_packet_buffer: ++out: + mutex_unlock(&pm->lock); + return retval; + } +@@ -503,82 +365,27 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) + { +- int retval; +- uint32_t 
*buffer; +- struct pm4_mes_unmap_queues *packet; ++ uint32_t *buffer, size; ++ int retval = 0; + ++ size = pm->pmf->get_unmap_queues_packet_size(); + mutex_lock(&pm->lock); +- retval = pm->priv_queue->ops.acquire_packet_buffer( +- pm->priv_queue, +- sizeof(struct pm4_mes_unmap_queues) / sizeof(uint32_t), +- &buffer); +- if (retval) +- goto err_acquire_packet_buffer; +- +- packet = (struct pm4_mes_unmap_queues *)buffer; +- memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); +- pr_debug("static_queue: unmapping queues: filter is %d , reset is %d , type is %d\n", +- filter, reset, type); +- packet->header.u32All = build_pm4_header(IT_UNMAP_QUEUES, +- sizeof(struct pm4_mes_unmap_queues)); +- switch (type) { +- case KFD_QUEUE_TYPE_COMPUTE: +- case KFD_QUEUE_TYPE_DIQ: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__compute; +- break; +- case KFD_QUEUE_TYPE_SDMA: +- packet->bitfields2.engine_sel = +- engine_sel__mes_unmap_queues__sdma0 + sdma_engine; +- break; +- default: +- WARN(1, "queue type %d", type); +- retval = -EINVAL; +- goto err_invalid; ++ pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, ++ size / sizeof(uint32_t), (unsigned int **)&buffer); ++ if (!buffer) { ++ pr_err("Failed to allocate buffer on kernel queue\n"); ++ retval = -ENOMEM; ++ goto out; + } + +- if (reset) +- packet->bitfields2.action = +- action__mes_unmap_queues__reset_queues; ++ retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, ++ reset, sdma_engine); ++ if (!retval) ++ pm->priv_queue->ops.submit_packet(pm->priv_queue); + else +- packet->bitfields2.action = +- action__mes_unmap_queues__preempt_queues; +- +- switch (filter) { +- case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_specified_queues; +- packet->bitfields2.num_queues = 1; +- packet->bitfields3b.doorbell_offset0 = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_BY_PASID: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; +- packet->bitfields3a.pasid = filter_param; +- break; +- case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_queues; +- break; +- case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: +- /* in this case, we do not preempt static queues */ +- packet->bitfields2.queue_sel = +- queue_sel__mes_unmap_queues__unmap_all_non_static_queues; +- break; +- default: +- WARN(1, "filter %d", filter); +- retval = -EINVAL; +- goto err_invalid; +- } ++ pm->priv_queue->ops.rollback_packet(pm->priv_queue); + +- pm->priv_queue->ops.submit_packet(pm->priv_queue); +- +- mutex_unlock(&pm->lock); +- return 0; +- +-err_invalid: +- pm->priv_queue->ops.rollback_packet(pm->priv_queue); +-err_acquire_packet_buffer: ++out: + mutex_unlock(&pm->lock); + return retval; + } +@@ -593,25 +400,17 @@ void pm_release_ib(struct packet_manager *pm) + mutex_unlock(&pm->lock); + } + +-#if defined(CONFIG_DEBUG_FS) +- + int pm_debugfs_runlist(struct seq_file *m, void *data) + { + struct packet_manager *pm = data; + +- mutex_lock(&pm->lock); +- + if (!pm->allocated) { + seq_puts(m, " No active runlist\n"); +- goto out; ++ return 0; + } + + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); + +-out: +- mutex_unlock(&pm->lock); + return 0; + } +- +-#endif +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c +new file mode 100644 +index 
0000000..fae8e8c
+--- /dev/null
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
+@@ -0,0 +1,515 @@
++/*
++ * Copyright 2016 Advanced Micro Devices, Inc.
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
++ * OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++
++/* NOTE:
++ *
++ * This file contains logic to dynamically detect and enable PeerDirect
++ * support. PeerDirect support is delivered e.g. as part of OFED
++ * from Mellanox. Because we are not able to rely on the fact that the
++ * corresponding OFED will be installed we should:
++ * - copy PeerDirect definitions locally to avoid dependency on
++ * corresponding header file
++ * - try to dynamically detect the addresses of the PeerDirect
++ * function pointers.
++ *
++ * If dynamic detection failed then PeerDirect support should be
++ * enabled using the standard PeerDirect bridge driver from:
++ * https://github.com/RadeonOpenCompute/ROCnRDMA
++ *
++ *
++ * Logic to support PeerDirect relies only on the official public API, to be
++ * as non-intrusive as possible.
++ *
++ **/
++
++#include <linux/device.h>
++#include <linux/export.h>
++#include <linux/pid.h>
++#include <linux/err.h>
++#include <linux/slab.h>
++#include <linux/scatterlist.h>
++#include <linux/module.h>
++
++#include "kfd_priv.h"
++#include "amd_rdma.h"
++
++
++
++/* ----------------------- PeerDirect interface ------------------------------*/
++
++/*
++ * Copyright (c) 2013, Mellanox Technologies. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#define IB_PEER_MEMORY_NAME_MAX 64 ++#define IB_PEER_MEMORY_VER_MAX 16 ++ ++struct peer_memory_client { ++ char name[IB_PEER_MEMORY_NAME_MAX]; ++ char version[IB_PEER_MEMORY_VER_MAX]; ++ /* acquire return code: 1-mine, 0-not mine */ ++ int (*acquire)(unsigned long addr, size_t size, ++ void *peer_mem_private_data, ++ char *peer_mem_name, ++ void **client_context); ++ int (*get_pages)(unsigned long addr, ++ size_t size, int write, int force, ++ struct sg_table *sg_head, ++ void *client_context, void *core_context); ++ int (*dma_map)(struct sg_table *sg_head, void *client_context, ++ struct device *dma_device, int dmasync, int *nmap); ++ int (*dma_unmap)(struct sg_table *sg_head, void *client_context, ++ struct device *dma_device); ++ void (*put_pages)(struct sg_table *sg_head, void *client_context); ++ unsigned long (*get_page_size)(void *client_context); ++ void (*release)(void *client_context); ++ void* (*get_context_private_data)(u64 peer_id); ++ void (*put_context_private_data)(void *context); ++}; ++ ++typedef int (*invalidate_peer_memory)(void *reg_handle, ++ void *core_context); ++ ++void *ib_register_peer_memory_client(struct peer_memory_client *peer_client, ++ invalidate_peer_memory *invalidate_callback); ++void ib_unregister_peer_memory_client(void *reg_handle); ++ ++ ++/*------------------- PeerDirect bridge driver ------------------------------*/ ++ ++#define AMD_PEER_BRIDGE_DRIVER_VERSION "1.0" ++#define AMD_PEER_BRIDGE_DRIVER_NAME "amdkfd" ++ ++ ++static void* (*pfn_ib_register_peer_memory_client)(struct peer_memory_client ++ *peer_client, ++ invalidate_peer_memory ++ *invalidate_callback); ++ ++static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); ++ ++static const struct amd_rdma_interface *rdma_interface; ++ ++static invalidate_peer_memory ib_invalidate_callback; ++static void *ib_reg_handle; ++ ++struct amd_mem_context { ++ uint64_t va; ++ uint64_t size; ++ struct pid *pid; ++ ++ struct amd_p2p_info *p2p_info; ++ ++ /* Flag that free callback was called */ ++ int free_callback_called; ++ ++ /* Context received from PeerDirect call */ ++ void *core_context; ++}; ++ ++ ++static void free_callback(void *client_priv) ++{ ++ struct amd_mem_context *mem_context = ++ (struct amd_mem_context *)client_priv; ++ ++ pr_debug("data 0x%p\n", mem_context); ++ ++ if (!mem_context) { ++ pr_warn("Invalid client context\n"); ++ return; ++ } ++ ++ pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context); ++ ++ /* Call back IB stack asking to invalidate memory */ ++ (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context); ++ ++ /* amdkfd will free resources when we return from this callback. ++ * Set flag to inform that there is nothing to do on "put_pages", etc. 
++ */
++ WRITE_ONCE(mem_context->free_callback_called, 1);
++}
++
++
++static int amd_acquire(unsigned long addr, size_t size,
++ void *peer_mem_private_data,
++ char *peer_mem_name, void **client_context)
++{
++ int ret;
++ struct amd_mem_context *mem_context;
++ struct pid *pid;
++
++ /* Get pointer to structure describing current process */
++ pid = get_task_pid(current, PIDTYPE_PID);
++
++ pr_debug("addr:0x%lx,size:0x%x, pid 0x%p\n",
++ addr, (unsigned int)size, pid);
++
++ /* Check if address is handled by AMD GPU driver */
++ ret = rdma_interface->is_gpu_address(addr, pid);
++
++ if (!ret) {
++ pr_debug("Not GPU Address\n");
++ /* This is not GPU address */
++ return 0;
++ }
++
++ pr_debug("GPU address\n");
++
++ /* Initialize context used for operation with given address */
++ mem_context = kzalloc(sizeof(*mem_context), GFP_KERNEL);
++
++ if (!mem_context)
++ return 0; /* Error case handled as not GPU address */
++
++ mem_context->free_callback_called = 0;
++ mem_context->va = addr;
++ mem_context->size = size;
++
++ /* Save PID. It is guaranteed that the function will be
++ * called in the correct process context, as opposed to the others.
++ */
++ mem_context->pid = pid;
++
++ pr_debug("Client context %p\n", mem_context);
++
++ /* Return pointer to allocated context */
++ *client_context = mem_context;
++
++ /* Return 1 to inform that this address will be handled
++ * by the AMD GPU driver
++ */
++ return 1;
++}
++
++static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
++ struct sg_table *sg_head,
++ void *client_context, void *core_context)
++{
++ int ret;
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("addr:0x%lx,size:0x%x, core_context:%p\n",
++ addr, (unsigned int)size, core_context);
++
++ if (!mem_context) {
++ pr_warn("Invalid client context");
++ return -EINVAL;
++ }
++
++ pr_debug("pid :0x%p\n", mem_context->pid);
++
++
++ if (addr != mem_context->va) {
++ pr_warn("Context address (0x%llx) is not the same\n",
++ mem_context->va);
++ return -EINVAL;
++ }
++
++ if (size != mem_context->size) {
++ pr_warn("Context size (0x%llx) is not the same\n",
++ mem_context->size);
++ return -EINVAL;
++ }
++
++ ret = rdma_interface->get_pages(addr,
++ size,
++ mem_context->pid,
++ &mem_context->p2p_info,
++ free_callback,
++ mem_context);
++
++ if (ret || !mem_context->p2p_info) {
++ pr_err("rdma::get_pages failed: %d\n", ret);
++ return ret;
++ }
++
++ mem_context->core_context = core_context;
++
++ /* Note: At this stage it is OK not to fill sg_table */
++ return 0;
++}
++
++
++static int amd_dma_map(struct sg_table *sg_head, void *client_context,
++ struct device *dma_device, int dmasync, int *nmap)
++{
++ /*
++ * NOTE/TODO:
++ * We could have potentially three cases for real memory
++ * location:
++ * - all memory in the local
++ * - all memory in the system (RAM)
++ * - memory is spread (s/g) between local and system.
++ *
++ * In the case of all memory in the system we could use
++ * iommu driver to build DMA addresses but not in the case
++ * of local memory because currently iommu driver doesn't
++ * deal with local/device memory addresses (it requires "struct
++ * page").
++ *
++ * Accordingly, returning here assumes that iommu functionality
++ * should be disabled so we can assume that sg_table already
++ * contains DMA addresses. 
++ *
++ */
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("Context 0x%p, sg_head 0x%p\n",
++ client_context, sg_head);
++
++ pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
++ mem_context->pid,
++ mem_context->va,
++ mem_context->size);
++
++ if (!mem_context->p2p_info) {
++ pr_err("No sg_table was allocated\n");
++ return -EINVAL;
++ }
++
++ /* Copy information about previously allocated sg_table */
++ *sg_head = *mem_context->p2p_info->pages;
++
++ /* Return number of pages */
++ *nmap = mem_context->p2p_info->pages->nents;
++
++ return 0;
++}
++
++static int amd_dma_unmap(struct sg_table *sg_head, void *client_context,
++ struct device *dma_device)
++{
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("Context 0x%p, sg_table 0x%p\n",
++ client_context, sg_head);
++
++ pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
++ mem_context->pid,
++ mem_context->va,
++ mem_context->size);
++
++ /* Assume success */
++ return 0;
++}
++static void amd_put_pages(struct sg_table *sg_head, void *client_context)
++{
++ int ret = 0;
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("sg_head %p client_context: 0x%p\n",
++ sg_head, client_context);
++ pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
++ mem_context->pid,
++ mem_context->va,
++ mem_context->size);
++
++ pr_debug("mem_context->p2p_info %p\n",
++ mem_context->p2p_info);
++
++ if (mem_context->free_callback_called) {
++ READ_ONCE(mem_context->free_callback_called);
++ pr_debug("Free callback was called\n");
++ return;
++ }
++
++ if (mem_context->p2p_info) {
++ ret = rdma_interface->put_pages(&mem_context->p2p_info);
++ mem_context->p2p_info = NULL;
++
++ if (ret)
++ pr_err("Failure: %d (callback status %d)\n",
++ ret, mem_context->free_callback_called);
++ } else
++ pr_err("Pointer to p2p info is null\n");
++}
++static unsigned long amd_get_page_size(void *client_context)
++{
++ unsigned long page_size;
++ int result;
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("context: %p\n", client_context);
++ pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
++ mem_context->pid,
++ mem_context->va,
++ mem_context->size);
++
++
++ result = rdma_interface->get_page_size(
++ mem_context->va,
++ mem_context->size,
++ mem_context->pid,
++ &page_size);
++
++ if (result) {
++ pr_err("Could not get page size. %d\n", result);
++ /* If we failed to get the page size then we do not know what to do.
++ * Let's return some default value.
++ */
++ return PAGE_SIZE;
++ }
++
++ return page_size;
++}
++
++static void amd_release(void *client_context)
++{
++ struct amd_mem_context *mem_context =
++ (struct amd_mem_context *)client_context;
++
++ pr_debug("context: 0x%p\n", client_context);
++ pr_debug("pid 0x%p, address 0x%llx, size:0x%llx\n",
++ mem_context->pid,
++ mem_context->va,
++ mem_context->size);
++
++ kfree(mem_context);
++}
++
++
++static struct peer_memory_client amd_mem_client = {
++ .acquire = amd_acquire,
++ .get_pages = amd_get_pages,
++ .dma_map = amd_dma_map,
++ .dma_unmap = amd_dma_unmap,
++ .put_pages = amd_put_pages,
++ .get_page_size = amd_get_page_size,
++ .release = amd_release,
++ .get_context_private_data = NULL,
++ .put_context_private_data = NULL,
++};
++
++/** Initialize PeerDirect interface with RDMA Network stack. 
++ * ++ * Because network stack could potentially be loaded later we check ++ * presence of PeerDirect when HSA process is created. If PeerDirect was ++ * already initialized we do nothing otherwise try to detect and register. ++ */ ++void kfd_init_peer_direct(void) ++{ ++ int result; ++ ++ if (pfn_ib_unregister_peer_memory_client) { ++ pr_debug("PeerDirect support was already initialized\n"); ++ return; ++ } ++ ++ pr_debug("Try to initialize PeerDirect support\n"); ++ ++ pfn_ib_register_peer_memory_client = ++ (void *(*)(struct peer_memory_client *, ++ invalidate_peer_memory *)) ++ symbol_request(ib_register_peer_memory_client); ++ ++ pfn_ib_unregister_peer_memory_client = (void (*)(void *)) ++ symbol_request(ib_unregister_peer_memory_client); ++ ++ if (!pfn_ib_register_peer_memory_client || ++ !pfn_ib_unregister_peer_memory_client) { ++ pr_debug("PeerDirect interface was not detected\n"); ++ /* Do cleanup */ ++ kfd_close_peer_direct(); ++ return; ++ } ++ ++ result = amdkfd_query_rdma_interface(&rdma_interface); ++ ++ if (result < 0) { ++ pr_err("Cannot get RDMA Interface (result = %d)\n", result); ++ return; ++ } ++ ++ strcpy(amd_mem_client.name, AMD_PEER_BRIDGE_DRIVER_NAME); ++ strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); ++ ++ ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, ++ &ib_invalidate_callback); ++ ++ if (!ib_reg_handle) { ++ pr_err("Cannot register peer memory client\n"); ++ /* Do cleanup */ ++ kfd_close_peer_direct(); ++ return; ++ } ++ ++ pr_info("PeerDirect support was initialized successfully\n"); ++} ++ ++/** ++ * Close connection with PeerDirect interface with RDMA Network stack. ++ * ++ */ ++void kfd_close_peer_direct(void) ++{ ++ if (pfn_ib_unregister_peer_memory_client) { ++ if (ib_reg_handle) ++ pfn_ib_unregister_peer_memory_client(ib_reg_handle); ++ ++ symbol_put(ib_unregister_peer_memory_client); ++ } ++ ++ if (pfn_ib_register_peer_memory_client) ++ symbol_put(ib_register_peer_memory_client); ++ ++ ++ /* Reset pointers to be safe */ ++ pfn_ib_unregister_peer_memory_client = NULL; ++ pfn_ib_register_peer_memory_client = NULL; ++ ib_reg_handle = NULL; ++} ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +new file mode 100644 +index 0000000..ddad9be +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +@@ -0,0 +1,583 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ * ++ */ ++ ++#ifndef F32_MES_PM4_PACKETS_H ++#define F32_MES_PM4_PACKETS_H ++ ++#ifndef PM4_MES_HEADER_DEFINED ++#define PM4_MES_HEADER_DEFINED ++union PM4_MES_TYPE_3_HEADER { ++ struct { ++ uint32_t reserved1 : 8; /* < reserved */ ++ uint32_t opcode : 8; /* < IT opcode */ ++ uint32_t count : 14;/* < number of DWORDs - 1 in the ++ * information body. ++ */ ++ uint32_t type : 2; /* < packet identifier. ++ * It should be 3 for type 3 packets ++ */ ++ }; ++ uint32_t u32All; ++}; ++#endif /* PM4_MES_HEADER_DEFINED */ ++ ++/*--------------------MES_SET_RESOURCES--------------------*/ ++ ++#ifndef PM4_MES_SET_RESOURCES_DEFINED ++#define PM4_MES_SET_RESOURCES_DEFINED ++enum mes_set_resources_queue_type_enum { ++ queue_type__mes_set_resources__kernel_interface_queue_kiq = 0, ++ queue_type__mes_set_resources__hsa_interface_queue_hiq = 1, ++ queue_type__mes_set_resources__hsa_debug_interface_queue = 4 ++}; ++ ++ ++struct pm4_mes_set_resources { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t vmid_mask:16; ++ uint32_t unmap_latency:8; ++ uint32_t reserved1:5; ++ enum mes_set_resources_queue_type_enum queue_type:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t queue_mask_lo; ++ uint32_t queue_mask_hi; ++ uint32_t gws_mask_lo; ++ uint32_t gws_mask_hi; ++ ++ union { ++ struct { ++ uint32_t oac_mask:16; ++ uint32_t reserved2:16; ++ } bitfields7; ++ uint32_t ordinal7; ++ }; ++ ++ union { ++ struct { ++ uint32_t gds_heap_base:6; ++ uint32_t reserved3:5; ++ uint32_t gds_heap_size:6; ++ uint32_t reserved4:15; ++ } bitfields8; ++ uint32_t ordinal8; ++ }; ++ ++}; ++#endif ++ ++/*--------------------MES_RUN_LIST--------------------*/ ++ ++#ifndef PM4_MES_RUN_LIST_DEFINED ++#define PM4_MES_RUN_LIST_DEFINED ++ ++struct pm4_mes_runlist { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved1:2; ++ uint32_t ib_base_lo:30; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t ib_base_hi; ++ ++ union { ++ struct { ++ uint32_t ib_size:20; ++ uint32_t chain:1; ++ uint32_t offload_polling:1; ++ uint32_t reserved2:1; ++ uint32_t valid:1; ++ uint32_t process_cnt:4; ++ uint32_t reserved3:4; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++}; ++#endif ++ ++/*--------------------MES_MAP_PROCESS--------------------*/ ++ ++#ifndef PM4_MES_MAP_PROCESS_DEFINED ++#define PM4_MES_MAP_PROCESS_DEFINED ++ ++struct pm4_mes_map_process { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:8; ++ uint32_t diq_enable:1; ++ uint32_t process_quantum:7; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ uint32_t vm_context_page_table_base_addr_lo32; ++ ++ uint32_t vm_context_page_table_base_addr_hi32; ++ ++ uint32_t sh_mem_bases; ++ ++ uint32_t sh_mem_config; ++ ++ uint32_t sq_shader_tba_lo; ++ ++ uint32_t sq_shader_tba_hi; ++ ++ uint32_t sq_shader_tma_lo; ++ ++ uint32_t sq_shader_tma_hi; ++ ++ uint32_t reserved6; ++ ++ uint32_t gds_addr_lo; ++ ++ uint32_t gds_addr_hi; ++ ++ union { ++ struct { ++ uint32_t num_gws:6; ++ uint32_t reserved7:1; ++ uint32_t sdma_enable:1; ++ 
uint32_t num_oac:4; ++ uint32_t reserved8:4; ++ uint32_t gds_size:6; ++ uint32_t num_queues:10; ++ } bitfields14; ++ uint32_t ordinal14; ++ }; ++ ++ uint32_t completion_signal_lo; ++ ++ uint32_t completion_signal_hi; ++ ++}; ++ ++#endif ++ ++/*--------------------MES_MAP_PROCESS_VM--------------------*/ ++ ++#ifndef PM4_MES_MAP_PROCESS_VM_DEFINED ++#define PM4_MES_MAP_PROCESS_VM_DEFINED ++ ++struct PM4_MES_MAP_PROCESS_VM { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ uint32_t reserved1; ++ ++ uint32_t vm_context_cntl; ++ ++ uint32_t reserved2; ++ ++ uint32_t vm_context_page_table_end_addr_lo32; ++ ++ uint32_t vm_context_page_table_end_addr_hi32; ++ ++ uint32_t vm_context_page_table_start_addr_lo32; ++ ++ uint32_t vm_context_page_table_start_addr_hi32; ++ ++ uint32_t reserved3; ++ ++ uint32_t reserved4; ++ ++ uint32_t reserved5; ++ ++ uint32_t reserved6; ++ ++ uint32_t reserved7; ++ ++ uint32_t reserved8; ++ ++ uint32_t completion_signal_lo32; ++ ++ uint32_t completion_signal_hi32; ++ ++}; ++#endif ++ ++/*--------------------MES_MAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_MAP_QUEUES_VI_DEFINED ++#define PM4_MES_MAP_QUEUES_VI_DEFINED ++enum mes_map_queues_queue_sel_enum { ++ queue_sel__mes_map_queues__map_to_specified_queue_slots_vi = 0, ++queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi = 1 ++}; ++ ++enum mes_map_queues_queue_type_enum { ++ queue_type__mes_map_queues__normal_compute_vi = 0, ++ queue_type__mes_map_queues__debug_interface_queue_vi = 1, ++ queue_type__mes_map_queues__normal_latency_static_queue_vi = 2, ++queue_type__mes_map_queues__low_latency_static_queue_vi = 3 ++}; ++ ++enum mes_map_queues_alloc_format_enum { ++ alloc_format__mes_map_queues__one_per_pipe_vi = 0, ++alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 ++}; ++ ++enum mes_map_queues_engine_sel_enum { ++ engine_sel__mes_map_queues__compute_vi = 0, ++ engine_sel__mes_map_queues__sdma0_vi = 2, ++ engine_sel__mes_map_queues__sdma1_vi = 3 ++}; ++ ++ ++struct pm4_mes_map_queues { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved1:4; ++ enum mes_map_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:15; ++ enum mes_map_queues_queue_type_enum queue_type:3; ++ enum mes_map_queues_alloc_format_enum alloc_format:2; ++ enum mes_map_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved3:1; ++ uint32_t check_disable:1; ++ uint32_t doorbell_offset:26; ++ uint32_t reserved4:4; ++ } bitfields3; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t mqd_addr_lo; ++ uint32_t mqd_addr_hi; ++ uint32_t wptr_addr_lo; ++ uint32_t wptr_addr_hi; ++}; ++#endif ++ ++/*--------------------MES_QUERY_STATUS--------------------*/ ++ ++#ifndef PM4_MES_QUERY_STATUS_DEFINED ++#define PM4_MES_QUERY_STATUS_DEFINED ++enum mes_query_status_interrupt_sel_enum { ++ interrupt_sel__mes_query_status__completion_status = 0, ++ interrupt_sel__mes_query_status__process_status = 1, ++ interrupt_sel__mes_query_status__queue_status = 2 ++}; ++ ++enum mes_query_status_command_enum { ++ command__mes_query_status__interrupt_only = 0, ++ command__mes_query_status__fence_only_immediate = 1, ++ command__mes_query_status__fence_only_after_write_ack = 2, ++ command__mes_query_status__fence_wait_for_write_ack_send_interrupt = 3 ++}; ++ ++enum mes_query_status_engine_sel_enum { ++ 
engine_sel__mes_query_status__compute = 0, ++ engine_sel__mes_query_status__sdma0_queue = 2, ++ engine_sel__mes_query_status__sdma1_queue = 3 ++}; ++ ++struct pm4_mes_query_status { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ uint32_t context_id:28; ++ enum mes_query_status_interrupt_sel_enum interrupt_sel:2; ++ enum mes_query_status_command_enum command:2; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved1:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved2:2; ++ uint32_t doorbell_offset:26; ++ enum mes_query_status_engine_sel_enum engine_sel:3; ++ uint32_t reserved3:1; ++ } bitfields3b; ++ uint32_t ordinal3; ++ }; ++ ++ uint32_t addr_lo; ++ uint32_t addr_hi; ++ uint32_t data_lo; ++ uint32_t data_hi; ++}; ++#endif ++ ++/*--------------------MES_UNMAP_QUEUES--------------------*/ ++ ++#ifndef PM4_MES_UNMAP_QUEUES_DEFINED ++#define PM4_MES_UNMAP_QUEUES_DEFINED ++enum mes_unmap_queues_action_enum { ++ action__mes_unmap_queues__preempt_queues = 0, ++ action__mes_unmap_queues__reset_queues = 1, ++ action__mes_unmap_queues__disable_process_queues = 2, ++ action__mes_unmap_queues__reserved = 3 ++}; ++ ++enum mes_unmap_queues_queue_sel_enum { ++ queue_sel__mes_unmap_queues__perform_request_on_specified_queues = 0, ++ queue_sel__mes_unmap_queues__perform_request_on_pasid_queues = 1, ++ queue_sel__mes_unmap_queues__unmap_all_queues = 2, ++ queue_sel__mes_unmap_queues__unmap_all_non_static_queues = 3 ++}; ++ ++enum mes_unmap_queues_engine_sel_enum { ++ engine_sel__mes_unmap_queues__compute = 0, ++ engine_sel__mes_unmap_queues__sdma0 = 2, ++ engine_sel__mes_unmap_queues__sdmal = 3 ++}; ++ ++struct pm4_mes_unmap_queues { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /* header */ ++ uint32_t ordinal1; ++ }; ++ ++ union { ++ struct { ++ enum mes_unmap_queues_action_enum action:2; ++ uint32_t reserved1:2; ++ enum mes_unmap_queues_queue_sel_enum queue_sel:2; ++ uint32_t reserved2:20; ++ enum mes_unmap_queues_engine_sel_enum engine_sel:3; ++ uint32_t num_queues:3; ++ } bitfields2; ++ uint32_t ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t pasid:16; ++ uint32_t reserved3:16; ++ } bitfields3a; ++ struct { ++ uint32_t reserved4:2; ++ uint32_t doorbell_offset0:26; ++ int32_t reserved5:4; ++ } bitfields3b; ++ uint32_t ordinal3; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved6:2; ++ uint32_t doorbell_offset1:26; ++ uint32_t reserved7:4; ++ } bitfields4; ++ uint32_t ordinal4; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved8:2; ++ uint32_t doorbell_offset2:26; ++ uint32_t reserved9:4; ++ } bitfields5; ++ uint32_t ordinal5; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved10:2; ++ uint32_t doorbell_offset3:26; ++ uint32_t reserved11:4; ++ } bitfields6; ++ uint32_t ordinal6; ++ }; ++}; ++#endif ++ ++#ifndef PM4_MEC_RELEASE_MEM_DEFINED ++#define PM4_MEC_RELEASE_MEM_DEFINED ++ ++enum mec_release_mem_event_index_enum { ++ event_index__mec_release_mem__end_of_pipe = 5, ++ event_index__mec_release_mem__shader_done = 6 ++}; ++ ++enum mec_release_mem_cache_policy_enum { ++ cache_policy__mec_release_mem__lru = 0, ++ cache_policy__mec_release_mem__stream = 1 ++}; ++ ++enum mec_release_mem_pq_exe_status_enum { ++ pq_exe_status__mec_release_mem__default = 0, ++ pq_exe_status__mec_release_mem__phase_update = 1 ++}; ++ ++enum mec_release_mem_dst_sel_enum { ++ dst_sel__mec_release_mem__memory_controller = 0, ++ dst_sel__mec_release_mem__tc_l2 = 1, ++ 
dst_sel__mec_release_mem__queue_write_pointer_register = 2, ++ dst_sel__mec_release_mem__queue_write_pointer_poll_mask_bit = 3 ++}; ++ ++enum mec_release_mem_int_sel_enum { ++ int_sel__mec_release_mem__none = 0, ++ int_sel__mec_release_mem__send_interrupt_only = 1, ++ int_sel__mec_release_mem__send_interrupt_after_write_confirm = 2, ++ int_sel__mec_release_mem__send_data_after_write_confirm = 3, ++ int_sel__mec_release_mem__unconditionally_send_int_ctxid = 4, ++ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_32_bit_compare = 5, ++ int_sel__mec_release_mem__conditionally_send_int_ctxid_based_on_64_bit_compare = 6 ++}; ++ ++enum mec_release_mem_data_sel_enum { ++ data_sel__mec_release_mem__none = 0, ++ data_sel__mec_release_mem__send_32_bit_low = 1, ++ data_sel__mec_release_mem__send_64_bit_data = 2, ++ data_sel__mec_release_mem__send_gpu_clock_counter = 3, ++ data_sel__mec_release_mem__send_cp_perfcounter_hi_lo = 4, ++ data_sel__mec_release_mem__store_gds_data_to_memory = 5 ++}; ++ ++struct pm4_mec_release_mem { ++ union { ++ union PM4_MES_TYPE_3_HEADER header; /*header */ ++ unsigned int ordinal1; ++ }; ++ ++ union { ++ struct { ++ unsigned int event_type:6; ++ unsigned int reserved1:2; ++ enum mec_release_mem_event_index_enum event_index:4; ++ unsigned int tcl1_vol_action_ena:1; ++ unsigned int tc_vol_action_ena:1; ++ unsigned int reserved2:1; ++ unsigned int tc_wb_action_ena:1; ++ unsigned int tcl1_action_ena:1; ++ unsigned int tc_action_ena:1; ++ uint32_t reserved3:1; ++ uint32_t tc_nc_action_ena:1; ++ uint32_t tc_wc_action_ena:1; ++ uint32_t tc_md_action_ena:1; ++ uint32_t reserved4:3; ++ enum mec_release_mem_cache_policy_enum cache_policy:2; ++ uint32_t reserved5:2; ++ enum mec_release_mem_pq_exe_status_enum pq_exe_status:1; ++ uint32_t reserved6:2; ++ } bitfields2; ++ unsigned int ordinal2; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved7:16; ++ enum mec_release_mem_dst_sel_enum dst_sel:2; ++ uint32_t reserved8:6; ++ enum mec_release_mem_int_sel_enum int_sel:3; ++ uint32_t reserved9:2; ++ enum mec_release_mem_data_sel_enum data_sel:3; ++ } bitfields3; ++ unsigned int ordinal3; ++ }; ++ ++ union { ++ struct { ++ uint32_t reserved10:2; ++ unsigned int address_lo_32b:30; ++ } bitfields4; ++ struct { ++ uint32_t reserved11:3; ++ uint32_t address_lo_64b:29; ++ } bitfields4b; ++ uint32_t reserved12; ++ unsigned int ordinal4; ++ }; ++ ++ union { ++ uint32_t address_hi; ++ uint32_t reserved13; ++ uint32_t ordinal5; ++ }; ++ ++ union { ++ uint32_t data_lo; ++ uint32_t cmp_data_lo; ++ struct { ++ uint32_t dw_offset:16; ++ uint32_t num_dwords:16; ++ } bitfields6c; ++ uint32_t reserved14; ++ uint32_t ordinal6; ++ }; ++ ++ union { ++ uint32_t data_hi; ++ uint32_t cmp_data_hi; ++ uint32_t reserved15; ++ uint32_t reserved16; ++ uint32_t ordinal7; ++ }; ++ ++ uint32_t int_ctxid; ++ ++}; ++ ++#endif ++ ++enum { ++ CACHE_FLUSH_AND_INV_TS_EVENT = 0x00000014 ++}; ++#endif ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +index a0ff348..0b314a8 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_diq.h +@@ -77,103 +77,6 @@ struct pm4__indirect_buffer_pasid { + + #endif + +-/*--------------------_RELEASE_MEM-------------------- */ +- +-#ifndef _PM4__RELEASE_MEM_DEFINED +-#define _PM4__RELEASE_MEM_DEFINED +-enum _RELEASE_MEM_event_index_enum { +- event_index___release_mem__end_of_pipe = 5, +- event_index___release_mem__shader_done = 6 +-}; +- +-enum 
_RELEASE_MEM_cache_policy_enum { +- cache_policy___release_mem__lru = 0, +- cache_policy___release_mem__stream = 1, +- cache_policy___release_mem__bypass = 2 +-}; +- +-enum _RELEASE_MEM_dst_sel_enum { +- dst_sel___release_mem__memory_controller = 0, +- dst_sel___release_mem__tc_l2 = 1, +- dst_sel___release_mem__queue_write_pointer_register = 2, +- dst_sel___release_mem__queue_write_pointer_poll_mask_bit = 3 +-}; +- +-enum _RELEASE_MEM_int_sel_enum { +- int_sel___release_mem__none = 0, +- int_sel___release_mem__send_interrupt_only = 1, +- int_sel___release_mem__send_interrupt_after_write_confirm = 2, +- int_sel___release_mem__send_data_after_write_confirm = 3 +-}; +- +-enum _RELEASE_MEM_data_sel_enum { +- data_sel___release_mem__none = 0, +- data_sel___release_mem__send_32_bit_low = 1, +- data_sel___release_mem__send_64_bit_data = 2, +- data_sel___release_mem__send_gpu_clock_counter = 3, +- data_sel___release_mem__send_cp_perfcounter_hi_lo = 4, +- data_sel___release_mem__store_gds_data_to_memory = 5 +-}; +- +-struct pm4__release_mem { +- union { +- union PM4_MES_TYPE_3_HEADER header; /*header */ +- unsigned int ordinal1; +- }; +- +- union { +- struct { +- unsigned int event_type:6; +- unsigned int reserved1:2; +- enum _RELEASE_MEM_event_index_enum event_index:4; +- unsigned int tcl1_vol_action_ena:1; +- unsigned int tc_vol_action_ena:1; +- unsigned int reserved2:1; +- unsigned int tc_wb_action_ena:1; +- unsigned int tcl1_action_ena:1; +- unsigned int tc_action_ena:1; +- unsigned int reserved3:6; +- unsigned int atc:1; +- enum _RELEASE_MEM_cache_policy_enum cache_policy:2; +- unsigned int reserved4:5; +- } bitfields2; +- unsigned int ordinal2; +- }; +- +- union { +- struct { +- unsigned int reserved5:16; +- enum _RELEASE_MEM_dst_sel_enum dst_sel:2; +- unsigned int reserved6:6; +- enum _RELEASE_MEM_int_sel_enum int_sel:3; +- unsigned int reserved7:2; +- enum _RELEASE_MEM_data_sel_enum data_sel:3; +- } bitfields3; +- unsigned int ordinal3; +- }; +- +- union { +- struct { +- unsigned int reserved8:2; +- unsigned int address_lo_32b:30; +- } bitfields4; +- struct { +- unsigned int reserved9:3; +- unsigned int address_lo_64b:29; +- } bitfields5; +- unsigned int ordinal4; +- }; +- +- unsigned int address_hi; +- +- unsigned int data_lo; +- +- unsigned int data_hi; +- +-}; +-#endif +- +- + /*--------------------_SET_CONFIG_REG-------------------- */ + + #ifndef _PM4__SET_CONFIG_REG_DEFINED +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 0bedcf9..345224e 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -30,20 +30,48 @@ + #include <linux/atomic.h> + #include <linux/workqueue.h> + #include <linux/spinlock.h> +-#include <linux/kfd_ioctl.h> + #include <linux/idr.h> +-#include <linux/kfifo.h> ++#include <linux/kfd_ioctl.h> ++#include <linux/pid.h> ++#include <linux/interval_tree.h> + #include <linux/seq_file.h> + #include <linux/kref.h> ++#include <linux/kfifo.h> + #include <kgd_kfd_interface.h> + ++#include "amd_rdma.h" + #include "amd_shared.h" + + #define KFD_SYSFS_FILE_MODE 0444 + +-#define KFD_MMAP_DOORBELL_MASK 0x8000000000000ull +-#define KFD_MMAP_EVENTS_MASK 0x4000000000000ull +-#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000ull ++/* GPU ID hash width in bits */ ++#define KFD_GPU_ID_HASH_WIDTH 16 ++ ++/* Use upper bits of mmap offset to store KFD driver specific information. ++ * BITS[63:62] - Encode MMAP type ++ * BITS[61:46] - Encode gpu_id. 
To identify to which GPU the offset belongs to ++ * BITS[45:40] - Reserved. Not Used. ++ * BITS[39:0] - MMAP offset value. Used by TTM. ++ * ++ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these ++ * defines are w.r.t to PAGE_SIZE ++ */ ++#define KFD_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) ++#define KFD_MMAP_TYPE_MASK (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) ++#define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) ++ ++#define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) ++#define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ ++ << KFD_MMAP_GPU_ID_SHIFT) ++#define KFD_MMAP_GPU_ID(gpu_id) ((((uint64_t)gpu_id) << KFD_MMAP_GPU_ID_SHIFT)\ ++ & KFD_MMAP_GPU_ID_MASK) ++#define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ ++ >> KFD_MMAP_GPU_ID_SHIFT) ++ ++#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) ++#define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) + + /* + * When working with cp scheduler we should assign the HIQ manually or via +@@ -55,8 +83,6 @@ + #define KFD_CIK_HIQ_PIPE 4 + #define KFD_CIK_HIQ_QUEUE 0 + +-/* GPU ID hash width in bits */ +-#define KFD_GPU_ID_HASH_WIDTH 16 + + /* Macro for allocating structures */ + #define kfd_alloc_struct(ptr_to_struct) \ +@@ -90,14 +116,14 @@ extern int max_num_of_queues_per_device; + /* Kernel module parameter to specify the scheduling policy */ + extern int sched_policy; + ++extern int cwsr_enable; ++ + /* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ + extern int hws_max_conc_proc; + +-extern int cwsr_enable; +- + /* + * Kernel module parameter to specify whether to send sigterm to HSA process on + * unhandled exception +@@ -105,11 +131,27 @@ extern int cwsr_enable; + extern int send_sigterm; + + /* ++ * This kernel module is used to simulate large bar machine on non-large bar ++ * enabled machines. 
++ */ ++extern int debug_largebar; ++ ++/* + * Ignore CRAT table during KFD initialization, can be used to work around + * broken CRAT tables on some AMD systems + */ + extern int ignore_crat; + ++/* ++ * Set sh_mem_config.retry_disable on Vega10 ++ */ ++extern int vega10_noretry; ++ ++/* ++ * Enable privileged mode for all CP queues including user queues ++ */ ++extern int priv_cp_queues; ++ + /** + * enum kfd_sched_policy + * +@@ -142,11 +184,14 @@ enum cache_policy { + cache_policy_noncoherent + }; + ++#define KFD_IS_SOC15(chip) ((chip) >= CHIP_VEGA10) ++ + struct kfd_event_interrupt_class { + bool (*interrupt_isr)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); ++ const uint32_t *ih_ring_entry, uint32_t *patched_ihre, ++ bool *patched_flag); + void (*interrupt_wq)(struct kfd_dev *dev, +- const uint32_t *ih_ring_entry); ++ const uint32_t *ih_ring_entry); + }; + + struct kfd_device_info { +@@ -154,10 +199,15 @@ struct kfd_device_info { + const struct kfd_event_interrupt_class *event_interrupt_class; + unsigned int max_pasid_bits; + unsigned int max_no_of_hqd; ++ unsigned int doorbell_size; + size_t ih_ring_entry_size; + uint8_t num_of_watch_points; + uint16_t mqd_size_aligned; + bool supports_cwsr; ++ bool needs_iommu_device; ++ bool needs_pci_atomics; ++ /* obtain from adev->sdma.num_instances */ ++ unsigned int num_sdma_engines; + }; + + struct kfd_mem_obj { +@@ -165,6 +215,7 @@ struct kfd_mem_obj { + uint32_t range_end; + uint64_t gpu_addr; + uint32_t *cpu_ptr; ++ void *gtt_mem; + }; + + struct kfd_vmid_info { +@@ -219,6 +270,7 @@ struct kfd_dev { + struct device_queue_manager *dqm; + + bool init_complete; ++ + /* + * Interrupts of interest to KFD are copied + * from the HW ring into a SW ring. +@@ -226,7 +278,10 @@ struct kfd_dev { + bool interrupts_active; + + /* Debug manager */ +- struct kfd_dbgmgr *dbgmgr; ++ struct kfd_dbgmgr *dbgmgr; ++ ++ /* MEC firmware version*/ ++ uint16_t mec_fw_version; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; +@@ -237,6 +292,16 @@ struct kfd_dev { + unsigned int cwsr_isa_size; + }; + ++struct kfd_ipc_obj; ++ ++struct kfd_bo { ++ void *mem; ++ struct interval_tree_node it; ++ struct kfd_dev *dev; ++ struct list_head cb_data_head; ++ struct kfd_ipc_obj *kfd_ipc_obj; ++}; ++ + /* KGD2KFD callbacks */ + void kgd2kfd_exit(void); + struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, +@@ -298,6 +363,11 @@ enum kfd_queue_format { + KFD_QUEUE_FORMAT_AQL + }; + ++enum KFD_QUEUE_PRIORITY { ++ KFD_QUEUE_PRIORITY_MINIMUM = 0, ++ KFD_QUEUE_PRIORITY_MAXIMUM = 15 ++}; ++ + /** + * struct queue_properties + * +@@ -352,9 +422,10 @@ struct queue_properties { + uint32_t queue_percent; + uint32_t *read_ptr; + uint32_t *write_ptr; +- uint32_t __iomem *doorbell_ptr; ++ void __iomem *doorbell_ptr; + uint32_t doorbell_off; + bool is_interop; ++ bool is_evicted; /* true -> queue is evicted */ + bool is_active; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; +@@ -370,6 +441,9 @@ struct queue_properties { + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; ++ /* Relevant for CU */ ++ uint32_t cu_mask_count; /* Must be a multiple of 32 */ ++ uint32_t *cu_mask; + }; + + /** +@@ -414,6 +488,7 @@ struct queue { + uint32_t queue; + + unsigned int sdma_id; ++ unsigned int doorbell_id; + + struct kfd_process *process; + struct kfd_dev *device; +@@ -430,6 +505,19 @@ enum KFD_MQD_TYPE { + KFD_MQD_TYPE_MAX + }; + ++enum KFD_PIPE_PRIORITY { ++ KFD_PIPE_PRIORITY_CS_LOW = 0, ++ 
KFD_PIPE_PRIORITY_CS_MEDIUM, ++ KFD_PIPE_PRIORITY_CS_HIGH ++}; ++ ++enum KFD_SPI_PRIORITY { ++ KFD_SPI_PRIORITY_EXTRA_LOW = 0, ++ KFD_SPI_PRIORITY_LOW, ++ KFD_SPI_PRIORITY_MEDIUM, ++ KFD_SPI_PRIORITY_HIGH ++}; ++ + struct scheduling_resources { + unsigned int vmid_mask; + enum kfd_queue_type type; +@@ -456,8 +544,10 @@ struct qcm_process_device { + struct list_head priv_queue_list; + + unsigned int queue_count; ++ /* a data field only meaningful for non-HWS case */ + unsigned int vmid; + bool is_debug; ++ unsigned int evicted; /* eviction counter, 0=active */ + + /* This flag tells if we should reset all wavefronts on + * process termination +@@ -480,10 +570,40 @@ struct qcm_process_device { + + /* CWSR memory */ + void *cwsr_kaddr; ++ uint64_t cwsr_base; + uint64_t tba_addr; + uint64_t tma_addr; ++ ++ /* IB memory */ ++ uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ ++ void *ib_kaddr; ++ ++ /*doorbell resources per process per device*/ ++ unsigned long *doorbell_bitmap; + }; + ++/* KFD Memory Eviction */ ++ ++/* Approx. wait time before attempting to restore evicted BOs */ ++#define PROCESS_RESTORE_TIME_MS 100 ++/* Approx. back off time if restore fails due to lack of memory */ ++#define PROCESS_BACK_OFF_TIME_MS 100 ++/* Approx. time before evicting the process again */ ++#define PROCESS_ACTIVE_TIME_MS 10 ++ ++int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, ++ struct dma_fence *fence); ++int kfd_process_evict_queues(struct kfd_process *p); ++int kfd_process_restore_queues(struct kfd_process *p); ++ ++ ++/* 8 byte handle containing GPU ID in the most significant 4 bytes and ++ * idr_handle in the least significant 4 bytes ++ */ ++#define MAKE_HANDLE(gpu_id, idr_handle) \ ++ (((uint64_t)(gpu_id) << 32) + idr_handle) ++#define GET_GPU_ID(handle) (handle >> 32) ++#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF) + + enum kfd_pdd_bound { + PDD_UNBOUND = 0, +@@ -516,8 +636,11 @@ struct kfd_process_device { + uint64_t scratch_base; + uint64_t scratch_limit; + +- /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ +- enum kfd_pdd_bound bound; ++ /* VM context for GPUVM allocations */ ++ void *vm; ++ ++ /* GPUVM allocations storage */ ++ struct idr alloc_idr; + + /* Flag used to tell the pdd has dequeued from the dqm. + * This is used to prevent dev->dqm->ops.process_termination() from +@@ -525,6 +648,9 @@ struct kfd_process_device { + * function. + */ + bool already_dequeued; ++ ++ /* Is this process/pasid bound to this device? (amd_iommu_bind_pasid) */ ++ enum kfd_pdd_bound bound; + }; + + #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) +@@ -584,11 +710,34 @@ struct kfd_process { + struct idr event_idr; + /* Event page */ + struct kfd_signal_page *signal_page; +- size_t signal_mapped_size; + size_t signal_event_count; + bool signal_event_limit_reached; ++ ++ struct rb_root_cached bo_interval_tree; ++ ++ /* Information used for memory eviction */ ++ void *process_info; ++ /* Eviction fence that is attached to all the BOs of this process. The ++ * fence will be triggered during eviction and new one will be created ++ * during restore ++ */ ++ struct dma_fence *ef; ++ ++ /* Work items for evicting and restoring BOs */ ++ struct delayed_work eviction_work; ++ struct delayed_work restore_work; ++ /* seqno of the last scheduled eviction */ ++ unsigned int last_eviction_seqno; ++ /* Approx. 
the last timestamp (in jiffies) when the process was ++ * restored after an eviction ++ */ ++ unsigned long last_restore_timestamp; + }; + ++#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ ++extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); ++extern struct srcu_struct kfd_processes_srcu; ++ + /** + * Ioctl function type. + * +@@ -607,25 +756,43 @@ struct amdkfd_ioctl_desc { + const char *name; + }; + +-void kfd_process_create_wq(void); ++int kfd_process_create_wq(void); + void kfd_process_destroy_wq(void); + struct kfd_process *kfd_create_process(struct file *filep); +-struct kfd_process *kfd_get_process(const struct task_struct *); ++struct kfd_process *kfd_get_process(const struct task_struct *task); + struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); ++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); + void kfd_unref_process(struct kfd_process *p); ++void kfd_suspend_all_processes(void); ++int kfd_resume_all_processes(void); + + struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, +- struct kfd_process *p); +-int kfd_bind_processes_to_device(struct kfd_dev *dev); +-void kfd_unbind_processes_from_device(struct kfd_dev *dev); +-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid); ++ struct kfd_process *p); + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + struct kfd_process *p); + + int kfd_reserved_mem_mmap(struct kfd_process *process, +- struct vm_area_struct *vma); ++ struct vm_area_struct *vma); ++ ++/* KFD process API for creating and translating handles */ ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length, ++ struct kfd_ipc_obj *ipc_obj); ++void *kfd_process_device_translate_handle(struct kfd_process_device *p, ++ int handle); ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle); ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr); ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle); ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj); ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid); + + /* Process device data iterator */ + struct kfd_process_device *kfd_get_first_process_device_data( +@@ -644,17 +811,20 @@ unsigned int kfd_pasid_alloc(void); + void kfd_pasid_free(unsigned int pasid); + + /* Doorbells */ ++size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); + int kfd_doorbell_init(struct kfd_dev *kfd); + void kfd_doorbell_fini(struct kfd_dev *kfd); +-int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma); +-u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, ++int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, ++ struct vm_area_struct *vma); ++void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, + unsigned int *doorbell_off); + void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr); + u32 read_kernel_doorbell(u32 __iomem *db); +-void write_kernel_doorbell(u32 __iomem *db, u32 value); +-unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd, ++void write_kernel_doorbell(void __iomem *db, u32 value); ++void write_kernel_doorbell64(void __iomem *db, u64 value); ++unsigned int kfd_doorbell_id_to_offset(struct kfd_dev 
*kfd, + struct kfd_process *process, +- unsigned int queue_id); ++ unsigned int doorbell_id); + phys_addr_t kfd_get_process_doorbells(struct kfd_dev *dev, + struct kfd_process *process); + int kfd_alloc_process_doorbells(struct kfd_process *process); +@@ -678,6 +848,7 @@ struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain); + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); + struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd); + int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); + int kfd_numa_node_to_apic_id(int numa_node_id); + +@@ -686,14 +857,22 @@ int kfd_interrupt_init(struct kfd_dev *dev); + void kfd_interrupt_exit(struct kfd_dev *dev); + void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); + bool enqueue_ih_ring_entry(struct kfd_dev *kfd, const void *ih_ring_entry); +-bool interrupt_is_wanted(struct kfd_dev *dev, const uint32_t *ih_ring_entry); ++bool interrupt_is_wanted(struct kfd_dev *dev, ++ const uint32_t *ih_ring_entry, ++ uint32_t *patched_ihre, bool *flag); + + /* Power Management */ + void kgd2kfd_suspend(struct kfd_dev *kfd); + int kgd2kfd_resume(struct kfd_dev *kfd); + ++/* GPU reset */ ++int kgd2kfd_pre_reset(struct kfd_dev *kfd); ++int kgd2kfd_post_reset(struct kfd_dev *kfd); ++ + /* amdkfd Apertures */ + int kfd_init_apertures(struct kfd_process *process); ++int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, ++ uint64_t base, uint64_t limit); + + /* Queue Context Management */ + int init_queue(struct queue **q, const struct queue_properties *properties); +@@ -705,13 +884,20 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); ++struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, ++ struct kfd_dev *dev); + struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); + void device_queue_manager_uninit(struct device_queue_manager *dqm); + struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, + enum kfd_queue_type type); + void kernel_queue_uninit(struct kernel_queue *kq); ++int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid); + + /* Process Queue Manager */ + struct process_queue_node { +@@ -732,8 +918,17 @@ int pqm_create_queue(struct process_queue_manager *pqm, + int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid); + int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + struct queue_properties *p); ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p); + struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, + unsigned int qid); ++int pqm_get_wave_state(struct process_queue_manager *pqm, ++ unsigned int qid, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size); ++int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); ++int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); + + int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + unsigned int fence_value, 
+@@ -744,6 +939,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, + #define KFD_FENCE_COMPLETED (100) + #define KFD_FENCE_INIT (10) + ++struct packet_manager_func; ++ + struct packet_manager { + struct device_queue_manager *dqm; + struct kernel_queue *priv_queue; +@@ -751,9 +948,41 @@ struct packet_manager { + bool allocated; + struct kfd_mem_obj *ib_buffer_obj; + unsigned int ib_size_bytes; ++ ++ struct packet_manager_funcs *pmf; + }; + +-int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); ++struct packet_manager_funcs { ++ /* Support different firmware versions for PM4 packets */ ++ int (*map_process)(struct packet_manager *pm, uint32_t *buffer, ++ struct qcm_process_device *qpd); ++ int (*runlist)(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain); ++ int (*set_resources)(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res); ++ int (*map_queues)(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static); ++ int (*unmap_queues)(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter mode, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine); ++ int (*query_status)(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value); ++ uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); ++ ++ uint32_t (*get_map_process_packet_size)(void); ++ uint32_t (*get_runlist_packet_size)(void); ++ uint32_t (*get_set_resources_packet_size)(void); ++ uint32_t (*get_map_queues_packet_size)(void); ++ uint32_t (*get_unmap_queues_packet_size)(void); ++ uint32_t (*get_query_status_packet_size)(void); ++ uint32_t (*get_release_mem_packet_size)(void); ++ ++}; ++ ++int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, ++ uint16_t fw_ver); + void pm_uninit(struct packet_manager *pm); + int pm_send_set_resources(struct packet_manager *pm, + struct scheduling_resources *res); +@@ -768,10 +997,44 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, + + void pm_release_ib(struct packet_manager *pm); + ++/* Following PM funcs can be shared among CIK and VI */ ++unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); ++int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t ib, size_t ib_size_in_dwords, bool chain); ++int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct queue *q, bool is_static); ++int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, ++ struct scheduling_resources *res); ++int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, ++ enum kfd_queue_type type, ++ enum kfd_unmap_queues_filter filter, ++ uint32_t filter_param, bool reset, ++ unsigned int sdma_engine); ++int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, ++ uint64_t fence_address, uint32_t fence_value); ++uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); ++ ++uint32_t pm_get_map_process_packet_size_vi(void); ++uint32_t pm_get_runlist_packet_size_vi(void); ++uint32_t pm_get_set_resources_packet_size_vi(void); ++uint32_t pm_get_map_queues_packet_size_vi(void); ++uint32_t pm_get_unmap_queues_packet_size_vi(void); ++uint32_t pm_get_query_status_packet_size_vi(void); ++uint32_t pm_get_release_mem_packet_size_vi(void); ++ ++ ++void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); ++void kfd_pm_func_init_cik(struct packet_manager *pm, 
uint16_t fw_ver); ++ ++void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); ++ ++ + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); + + /* Events */ + extern const struct kfd_event_interrupt_class event_interrupt_class_cik; ++extern const struct kfd_event_interrupt_class event_interrupt_class_v9; ++ + extern const struct kfd_device_global_init_class device_global_init_class_cik; + + void kfd_event_init_process(struct kfd_process *p); +@@ -792,11 +1055,26 @@ int kfd_reset_event(struct kfd_process *p, uint32_t event_id); + int kfd_event_create(struct file *devkfd, struct kfd_process *p, + uint32_t event_type, bool auto_reset, uint32_t node_id, + uint32_t *event_id, uint32_t *event_trigger_data, +- uint64_t *event_page_offset, uint32_t *event_slot_index); ++ uint64_t *event_page_offset, uint32_t *event_slot_index, ++ void *kern_addr); + int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); + ++void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ++ struct kfd_vm_fault_info *info); ++ ++void kfd_flush_tlb(struct kfd_process_device *pdd); ++ + int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); + ++#define KFD_SCRATCH_KV_FW_VER 413 ++ ++/* PeerDirect support */ ++void kfd_init_peer_direct(void); ++void kfd_close_peer_direct(void); ++ ++/* IPC Support */ ++int kfd_ipc_init(void); ++ + /* Debugfs */ + #if defined(CONFIG_DEBUG_FS) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index 4ff5f0f..7c6bcbd 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -30,35 +30,58 @@ + #include <linux/notifier.h> + #include <linux/compat.h> + #include <linux/mman.h> ++#include <asm/page.h> ++#include "kfd_ipc.h" + + struct mm_struct; + + #include "kfd_priv.h" ++#include "kfd_device_queue_manager.h" + #include "kfd_dbgmgr.h" ++#include "kfd_iommu.h" + + /* + * List of struct kfd_process (field kfd_process). + * Unique/indexed by mm_struct* + */ +-#define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ +-static DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); ++DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); + static DEFINE_MUTEX(kfd_processes_mutex); + +-DEFINE_STATIC_SRCU(kfd_processes_srcu); ++DEFINE_SRCU(kfd_processes_srcu); + ++/* For process termination handling */ + static struct workqueue_struct *kfd_process_wq; + +-static struct kfd_process *find_process(const struct task_struct *thread); ++/* Ordered, single-threaded workqueue for restoring evicted ++ * processes. Restoring multiple processes concurrently under memory ++ * pressure can lead to processes blocking each other from validating ++ * their BOs and result in a live-lock situation where processes ++ * remain evicted indefinitely. 
++ */ ++static struct workqueue_struct *kfd_restore_wq; ++ ++#define MIN_IDR_ID 1 ++#define MAX_IDR_ID 0 /*0 - for unlimited*/ ++ ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool ref); + static void kfd_process_ref_release(struct kref *ref); + static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep); + static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); + ++static void evict_process_worker(struct work_struct *work); ++static void restore_process_worker(struct work_struct *work); + +-void kfd_process_create_wq(void) ++ ++int kfd_process_create_wq(void) + { + if (!kfd_process_wq) + kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0); ++ if (!kfd_restore_wq) ++ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); ++ ++ return kfd_process_wq && kfd_restore_wq ? 0 : -ENOMEM; + } + + void kfd_process_destroy_wq(void) +@@ -67,11 +90,140 @@ void kfd_process_destroy_wq(void) + destroy_workqueue(kfd_process_wq); + kfd_process_wq = NULL; + } ++ if (kfd_restore_wq) { ++ destroy_workqueue(kfd_restore_wq); ++ kfd_restore_wq = NULL; ++ } ++} ++ ++static void kfd_process_free_gpuvm(struct kgd_mem *mem, ++ struct kfd_process_device *pdd) ++{ ++ struct kfd_dev *dev = pdd->dev; ++ ++ dev->kfd2kgd->unmap_memory_to_gpu(dev->kgd, mem, pdd->vm); ++ dev->kfd2kgd->free_memory_of_gpu(dev->kgd, mem); ++} ++ ++/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process ++ * This function should be only called right after the process ++ * is created and when kfd_processes_mutex is still being held ++ * to avoid concurrency. Because of that exclusiveness, we do ++ * not need to take p->mutex. ++ */ ++static int kfd_process_alloc_gpuvm(struct kfd_process *p, ++ struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size, ++ void **kptr, struct kfd_process_device *pdd, uint32_t flags) ++{ ++ int err; ++ void *mem = NULL; ++ int handle; ++ ++ err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, ++ pdd->vm, ++ (struct kgd_mem **)&mem, NULL, flags); ++ if (err) ++ goto err_alloc_mem; ++ ++ err = kdev->kfd2kgd->map_memory_to_gpu( ++ kdev->kgd, (struct kgd_mem *)mem, pdd->vm); ++ if (err) ++ goto err_map_mem; ++ ++ err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem, ++ true); ++ if (err) { ++ pr_debug("Sync memory failed, wait interrupted by user signal\n"); ++ goto sync_memory_failed; ++ } ++ ++ /* Create an obj handle so kfd_process_device_remove_obj_handle ++ * will take care of the bo removal when the process finishes. ++ * We do not need to take p->mutex, because the process is just ++ * created and the ioctls have not had the chance to run. ++ */ ++ handle = kfd_process_device_create_obj_handle( ++ pdd, mem, gpu_va, size, NULL); ++ ++ if (handle < 0) { ++ err = handle; ++ goto free_gpuvm; ++ } ++ ++ if (kptr) { ++ err = kdev->kfd2kgd->map_gtt_bo_to_kernel(kdev->kgd, ++ (struct kgd_mem *)mem, kptr, NULL); ++ if (err) { ++ pr_debug("Map GTT BO to kernel failed\n"); ++ goto free_obj_handle; ++ } ++ } ++ ++ return err; ++ ++free_obj_handle: ++ kfd_process_device_remove_obj_handle(pdd, handle); ++free_gpuvm: ++sync_memory_failed: ++ kfd_process_free_gpuvm(mem, pdd); ++ return err; ++ ++err_map_mem: ++ kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem); ++err_alloc_mem: ++ *kptr = NULL; ++ return err; ++} ++ ++/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage ++ * The memory reserved is for KFD to submit IB to AMDGPU from kernel. 
++ * If the memory is reserved successfully, ib_kaddr_assigned will have ++ * the CPU/kernel address. Check ib_kaddr_assigned before accessing the ++ * memory. ++ */ ++static int kfd_process_reserve_ib_mem(struct kfd_process *p) ++{ ++ int ret = 0; ++ struct kfd_process_device *pdd = NULL; ++ struct kfd_dev *kdev = NULL; ++ struct qcm_process_device *qpd = NULL; ++ void *kaddr; ++ uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | ++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ++ ALLOC_MEM_FLAGS_EXECUTE_ACCESS; ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ kdev = pdd->dev; ++ qpd = &pdd->qpd; ++ if (qpd->ib_kaddr) ++ continue; ++ ++ if (qpd->ib_base) { /* is dGPU */ ++ ret = kfd_process_alloc_gpuvm(p, kdev, ++ qpd->ib_base, PAGE_SIZE, ++ &kaddr, pdd, flags); ++ if (!ret) ++ qpd->ib_kaddr = kaddr; ++ else ++ /* In case of error, the kfd_bos for some pdds ++ * which are already allocated successfully ++ * will be freed in upper level function ++ * i.e. create_process(). ++ */ ++ return ret; ++ } else { ++ /* FIXME: Support APU */ ++ continue; ++ } ++ } ++ ++ return 0; + } + + struct kfd_process *kfd_create_process(struct file *filep) + { + struct kfd_process *process; ++ + struct task_struct *thread = current; + + if (!thread->mm) +@@ -89,7 +241,7 @@ struct kfd_process *kfd_create_process(struct file *filep) + mutex_lock(&kfd_processes_mutex); + + /* A prior open of /dev/kfd could have already created the process. */ +- process = find_process(thread); ++ process = find_process(thread, false); + if (process) + pr_debug("Process already found\n"); + else +@@ -111,7 +263,7 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) + if (thread->group_leader->mm != thread->mm) + return ERR_PTR(-EINVAL); + +- process = find_process(thread); ++ process = find_process(thread, false); + + return process; + } +@@ -128,13 +280,16 @@ static struct kfd_process *find_process_by_mm(const struct mm_struct *mm) + return NULL; + } + +-static struct kfd_process *find_process(const struct task_struct *thread) ++static struct kfd_process *find_process(const struct task_struct *thread, ++ bool ref) + { + struct kfd_process *p; + int idx; + + idx = srcu_read_lock(&kfd_processes_srcu); + p = find_process_by_mm(thread->mm); ++ if (p && ref) ++ kref_get(&p->ref); + srcu_read_unlock(&kfd_processes_srcu, idx); + + return p; +@@ -145,21 +300,70 @@ void kfd_unref_process(struct kfd_process *p) + kref_put(&p->ref, kfd_process_ref_release); + } + ++/* This increments the process->ref counter. 
*/ ++struct kfd_process *kfd_lookup_process_by_pid(struct pid *pid) ++{ ++ struct task_struct *task = NULL; ++ struct kfd_process *p = NULL; ++ ++ if (!pid) ++ task = current; ++ else ++ task = get_pid_task(pid, PIDTYPE_PID); ++ ++ if (task) ++ p = find_process(task, true); ++ ++ return p; ++} ++ ++static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) ++{ ++ struct kfd_process_device *pdd, *peer_pdd; ++ struct kfd_bo *buf_obj; ++ int id; ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ /* ++ * Remove all handles from idr and release appropriate ++ * local memory object ++ */ ++ idr_for_each_entry(&pdd->alloc_idr, buf_obj, id) { ++ list_for_each_entry(peer_pdd, &p->per_device_data, ++ per_device_list) { ++ peer_pdd->dev->kfd2kgd->unmap_memory_to_gpu( ++ peer_pdd->dev->kgd, ++ buf_obj->mem, peer_pdd->vm); ++ } ++ ++ run_rdma_free_callback(buf_obj); ++ pdd->dev->kfd2kgd->free_memory_of_gpu( ++ pdd->dev->kgd, buf_obj->mem); ++ kfd_process_device_remove_obj_handle(pdd, id); ++ } ++ } ++} ++ + static void kfd_process_destroy_pdds(struct kfd_process *p) + { + struct kfd_process_device *pdd, *temp; + + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { +- pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", +- pdd->dev->id, p->pasid); ++ /* Destroy the GPUVM VM context */ ++ if (pdd->vm) ++ pdd->dev->kfd2kgd->destroy_process_vm( ++ pdd->dev->kgd, pdd->vm); + + list_del(&pdd->per_device_list); + +- if (pdd->qpd.cwsr_kaddr) ++ if (pdd->qpd.cwsr_kaddr && !pdd->qpd.cwsr_base) + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + ++ kfree(pdd->qpd.doorbell_bitmap); ++ idr_destroy(&pdd->alloc_idr); ++ + kfree(pdd); + } + } +@@ -173,16 +377,13 @@ static void kfd_process_wq_release(struct work_struct *work) + { + struct kfd_process *p = container_of(work, struct kfd_process, + release_work); +- struct kfd_process_device *pdd; + +- pr_debug("Releasing process (pasid %d) in workqueue\n", p->pasid); ++ kfd_iommu_unbind_process(p); + +- list_for_each_entry(pdd, &p->per_device_data, per_device_list) { +- if (pdd->bound == PDD_BOUND) +- amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); +- } ++ kfd_process_free_outstanding_kfd_bos(p); + + kfd_process_destroy_pdds(p); ++ dma_fence_put(p->ef); + + kfd_event_free_process(p); + +@@ -200,6 +401,9 @@ static void kfd_process_ref_release(struct kref *ref) + { + struct kfd_process *p = container_of(ref, struct kfd_process, ref); + ++ if (WARN_ON(!kfd_process_wq)) ++ return; ++ + INIT_WORK(&p->release_work, kfd_process_wq_release); + queue_work(kfd_process_wq, &p->release_work); + } +@@ -230,6 +434,9 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, + mutex_unlock(&kfd_processes_mutex); + synchronize_srcu(&kfd_processes_srcu); + ++ cancel_delayed_work_sync(&p->eviction_work); ++ cancel_delayed_work_sync(&p->restore_work); ++ + mutex_lock(&p->mutex); + + /* Iterate over all process device data structures and if the +@@ -267,28 +474,50 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { + + static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) + { ++ int ret; + unsigned long offset; + struct kfd_process_device *pdd = NULL; + struct kfd_dev *dev = NULL; + struct qcm_process_device *qpd = NULL; ++ void *kaddr; ++ uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | ++ ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ++ ALLOC_MEM_FLAGS_READONLY | ++ ALLOC_MEM_FLAGS_EXECUTE_ACCESS; + + 
list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + dev = pdd->dev; + qpd = &pdd->qpd; + if (!dev->cwsr_enabled || qpd->cwsr_kaddr) + continue; +- offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; +- qpd->tba_addr = (int64_t)vm_mmap(filep, 0, +- KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, +- MAP_SHARED, offset); +- +- if (IS_ERR_VALUE(qpd->tba_addr)) { +- int err = qpd->tba_addr; +- +- pr_err("Failure to set tba address. error %d.\n", err); +- qpd->tba_addr = 0; +- qpd->cwsr_kaddr = NULL; +- return err; ++ if (qpd->cwsr_base) { ++ /* cwsr_base is only set for DGPU */ ++ ret = kfd_process_alloc_gpuvm(p, dev, qpd->cwsr_base, ++ KFD_CWSR_TBA_TMA_SIZE, &kaddr, pdd, flags); ++ if (!ret) { ++ qpd->cwsr_kaddr = kaddr; ++ qpd->tba_addr = qpd->cwsr_base; ++ } else ++ /* In case of error, the kfd_bos for some pdds ++ * which are already allocated successfully ++ * will be freed in upper level function ++ * i.e. create_process(). ++ */ ++ return ret; ++ } else { ++ offset = (dev->id | ++ KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; ++ qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, ++ KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, ++ MAP_SHARED, offset); ++ ++ if (IS_ERR_VALUE(qpd->tba_addr)) { ++ pr_err("Failure to set tba address. error -%d.\n", ++ (int)qpd->tba_addr); ++ qpd->tba_addr = 0; ++ qpd->cwsr_kaddr = NULL; ++ return -ENOMEM; ++ } + } + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); +@@ -312,6 +541,8 @@ static struct kfd_process *create_process(const struct task_struct *thread, + if (!process) + goto err_alloc_process; + ++ process->bo_interval_tree = RB_ROOT_CACHED; ++ + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; +@@ -351,13 +582,27 @@ static struct kfd_process *create_process(const struct task_struct *thread, + if (err != 0) + goto err_init_apertures; + ++ INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); ++ INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); ++ process->last_restore_timestamp = get_jiffies_64(); ++ ++ err = kfd_process_reserve_ib_mem(process); ++ if (err) ++ goto err_reserve_ib_mem; + err = kfd_process_init_cwsr(process, filep); + if (err) + goto err_init_cwsr; + ++ /* If PeerDirect interface was not detected try to detect it again ++ * in case if network driver was loaded later. 
++ */ ++ kfd_init_peer_direct(); ++ + return process; + + err_init_cwsr: ++err_reserve_ib_mem: ++ kfd_process_free_outstanding_kfd_bos(process); + kfd_process_destroy_pdds(process); + err_init_apertures: + pqm_uninit(&process->pqm); +@@ -376,6 +621,31 @@ static struct kfd_process *create_process(const struct task_struct *thread, + return ERR_PTR(err); + } + ++static int init_doorbell_bitmap(struct qcm_process_device *qpd, ++ struct kfd_dev *dev) ++{ ++ unsigned int i; ++ ++ if (!KFD_IS_SOC15(dev->device_info->asic_family)) ++ return 0; ++ ++ qpd->doorbell_bitmap = ++ kzalloc(DIV_ROUND_UP(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, ++ BITS_PER_BYTE), GFP_KERNEL); ++ if (!qpd->doorbell_bitmap) ++ return -ENOMEM; ++ ++ /* Mask out any reserved doorbells */ ++ for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS; i++) ++ if ((dev->shared_resources.reserved_doorbell_mask & i) == ++ dev->shared_resources.reserved_doorbell_val) { ++ set_bit(i, qpd->doorbell_bitmap); ++ pr_debug("reserved doorbell 0x%03x\n", i); ++ } ++ ++ return 0; ++} ++ + struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, + struct kfd_process *p) + { +@@ -402,12 +672,33 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; ++ pdd->qpd.evicted = 0; + pdd->process = p; + pdd->bound = PDD_UNBOUND; + pdd->already_dequeued = false; + list_add(&pdd->per_device_list, &p->per_device_data); + ++ /* Init idr used for memory handle translation */ ++ idr_init(&pdd->alloc_idr); ++ if (init_doorbell_bitmap(&pdd->qpd, dev)) { ++ pr_err("Failed to init doorbell for process\n"); ++ goto err_create_pdd; ++ } ++ ++ /* Create the GPUVM context for this specific device */ ++ if (dev->kfd2kgd->create_process_vm(dev->kgd, &pdd->vm, ++ &p->process_info, &p->ef)) { ++ pr_err("Failed to create process VM object\n"); ++ goto err_create_pdd; ++ } + return pdd; ++ ++err_create_pdd: ++ kfree(pdd->qpd.doorbell_bitmap); ++ idr_destroy(&pdd->alloc_idr); ++ list_del(&pdd->per_device_list); ++ kfree(pdd); ++ return NULL; + } + + /* +@@ -429,178 +720,377 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, + return ERR_PTR(-ENOMEM); + } + +- if (pdd->bound == PDD_BOUND) { +- return pdd; +- } else if (unlikely(pdd->bound == PDD_BOUND_SUSPENDED)) { +- pr_err("Binding PDD_BOUND_SUSPENDED pdd is unexpected!\n"); +- return ERR_PTR(-EINVAL); +- } +- +- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, p->lead_thread); +- if (err < 0) ++ err = kfd_iommu_bind_process_to_device(pdd); ++ if (err) + return ERR_PTR(err); + +- pdd->bound = PDD_BOUND; +- + return pdd; + } + +-/* +- * Bind processes do the device that have been temporarily unbound +- * (PDD_BOUND_SUSPENDED) in kfd_unbind_processes_from_device. ++struct kfd_process_device *kfd_get_first_process_device_data( ++ struct kfd_process *p) ++{ ++ return list_first_entry(&p->per_device_data, ++ struct kfd_process_device, ++ per_device_list); ++} ++ ++struct kfd_process_device *kfd_get_next_process_device_data( ++ struct kfd_process *p, ++ struct kfd_process_device *pdd) ++{ ++ if (list_is_last(&pdd->per_device_list, &p->per_device_data)) ++ return NULL; ++ return list_next_entry(pdd, per_device_list); ++} ++ ++bool kfd_has_process_device_data(struct kfd_process *p) ++{ ++ return !(list_empty(&p->per_device_data)); ++} ++ ++/* Create specific handle mapped to mem from process local memory idr ++ * Assumes that the process lock is held. 
+ */ +-int kfd_bind_processes_to_device(struct kfd_dev *dev) ++int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, ++ void *mem, uint64_t start, ++ uint64_t length, ++ struct kfd_ipc_obj *ipc_obj) + { +- struct kfd_process_device *pdd; ++ int handle; ++ struct kfd_bo *buf_obj; + struct kfd_process *p; +- unsigned int temp; +- int err = 0; + +- int idx = srcu_read_lock(&kfd_processes_srcu); ++ p = pdd->process; + +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- mutex_lock(&p->mutex); +- pdd = kfd_get_process_device_data(dev, p); ++ buf_obj = kzalloc(sizeof(*buf_obj), GFP_KERNEL); + +- if (WARN_ON(!pdd) || pdd->bound != PDD_BOUND_SUSPENDED) { +- mutex_unlock(&p->mutex); +- continue; +- } ++ if (!buf_obj) ++ return -ENOMEM; + +- err = amd_iommu_bind_pasid(dev->pdev, p->pasid, +- p->lead_thread); +- if (err < 0) { +- pr_err("Unexpected pasid %d binding failure\n", +- p->pasid); +- mutex_unlock(&p->mutex); +- break; +- } ++ buf_obj->it.start = start; ++ buf_obj->it.last = start + length - 1; ++ interval_tree_insert(&buf_obj->it, &p->bo_interval_tree); + +- pdd->bound = PDD_BOUND; +- mutex_unlock(&p->mutex); ++ buf_obj->mem = mem; ++ buf_obj->dev = pdd->dev; ++ buf_obj->kfd_ipc_obj = ipc_obj; ++ ++ INIT_LIST_HEAD(&buf_obj->cb_data_head); ++ ++ idr_preload(GFP_KERNEL); ++ ++ handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, ++ GFP_NOWAIT); ++ ++ idr_preload_end(); ++ ++ if (handle < 0) ++ kfree(buf_obj); ++ ++ return handle; ++} ++ ++struct kfd_bo *kfd_process_device_find_bo(struct kfd_process_device *pdd, ++ int handle) ++{ ++ if (handle < 0) ++ return NULL; ++ ++ return (struct kfd_bo *)idr_find(&pdd->alloc_idr, handle); ++} ++ ++/* Translate specific handle from process local memory idr ++ * Assumes that the process lock is held. ++ */ ++void *kfd_process_device_translate_handle(struct kfd_process_device *pdd, ++ int handle) ++{ ++ struct kfd_bo *buf_obj; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ return buf_obj->mem; ++} ++ ++void *kfd_process_find_bo_from_interval(struct kfd_process *p, ++ uint64_t start_addr, ++ uint64_t last_addr) ++{ ++ struct interval_tree_node *it_node; ++ struct kfd_bo *buf_obj; ++ ++ it_node = interval_tree_iter_first(&p->bo_interval_tree, ++ start_addr, last_addr); ++ if (!it_node) { ++ pr_err("0x%llx-0x%llx does not relate to an existing buffer\n", ++ start_addr, last_addr); ++ return NULL; + } + +- srcu_read_unlock(&kfd_processes_srcu, idx); ++ if (interval_tree_iter_next(it_node, start_addr, last_addr)) { ++ pr_err("0x%llx-0x%llx spans more than a single BO\n", ++ start_addr, last_addr); ++ return NULL; ++ } + +- return err; ++ buf_obj = container_of(it_node, struct kfd_bo, it); ++ ++ return buf_obj; + } + +-/* +- * Mark currently bound processes as PDD_BOUND_SUSPENDED. These +- * processes will be restored to PDD_BOUND state in +- * kfd_bind_processes_to_device. ++/* Remove specific handle from process local memory idr ++ * Assumes that the process lock is held. 
+ */ +-void kfd_unbind_processes_from_device(struct kfd_dev *dev) ++void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd, ++ int handle) + { +- struct kfd_process_device *pdd; ++ struct kfd_bo *buf_obj; + struct kfd_process *p; ++ ++ p = pdd->process; ++ ++ if (handle < 0) ++ return; ++ ++ buf_obj = kfd_process_device_find_bo(pdd, handle); ++ ++ if (buf_obj->kfd_ipc_obj) ++ ipc_obj_put(&buf_obj->kfd_ipc_obj); ++ ++ idr_remove(&pdd->alloc_idr, handle); ++ ++ interval_tree_remove(&buf_obj->it, &p->bo_interval_tree); ++ ++ kfree(buf_obj); ++} ++ ++/* This increments the process->ref counter. */ ++struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) ++{ ++ struct kfd_process *p, *ret_p = NULL; + unsigned int temp; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- mutex_lock(&p->mutex); +- pdd = kfd_get_process_device_data(dev, p); +- +- if (WARN_ON(!pdd)) { +- mutex_unlock(&p->mutex); +- continue; ++ if (p->pasid == pasid) { ++ kref_get(&p->ref); ++ ret_p = p; ++ break; + } +- +- if (pdd->bound == PDD_BOUND) +- pdd->bound = PDD_BOUND_SUSPENDED; +- mutex_unlock(&p->mutex); + } + + srcu_read_unlock(&kfd_processes_srcu, idx); ++ ++ return ret_p; + } + +-void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) ++void kfd_suspend_all_processes(void) + { + struct kfd_process *p; +- struct kfd_process_device *pdd; +- +- /* +- * Look for the process that matches the pasid. If there is no such +- * process, we either released it in amdkfd's own notifier, or there +- * is a bug. Unfortunately, there is no way to tell... +- */ +- p = kfd_lookup_process_by_pasid(pasid); +- if (!p) +- return; ++ unsigned int temp; ++ int idx = srcu_read_lock(&kfd_processes_srcu); + +- pr_debug("Unbinding process %d from IOMMU\n", pasid); ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ cancel_delayed_work_sync(&p->eviction_work); ++ cancel_delayed_work_sync(&p->restore_work); ++ ++ if (kfd_process_evict_queues(p)) ++ pr_err("Failed to suspend process %d\n", p->pasid); ++ dma_fence_signal(p->ef); ++ dma_fence_put(p->ef); ++ p->ef = NULL; ++ } ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++} + +- mutex_lock(kfd_get_dbgmgr_mutex()); ++int kfd_resume_all_processes(void) ++{ ++ struct kfd_process *p; ++ unsigned int temp; ++ int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + +- if (dev->dbgmgr && dev->dbgmgr->pasid == p->pasid) { +- if (!kfd_dbgmgr_unregister(dev->dbgmgr, p)) { +- kfd_dbgmgr_destroy(dev->dbgmgr); +- dev->dbgmgr = NULL; ++ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { ++ if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { ++ pr_err("Restore process %d failed during resume\n", ++ p->pasid); ++ ret = -EFAULT; + } + } ++ srcu_read_unlock(&kfd_processes_srcu, idx); ++ return ret; ++} + +- mutex_unlock(kfd_get_dbgmgr_mutex()); ++/* This increments the process->ref counter. */ ++struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) ++{ ++ struct kfd_process *p; + +- mutex_lock(&p->mutex); ++ int idx = srcu_read_lock(&kfd_processes_srcu); + +- pdd = kfd_get_process_device_data(dev, p); +- if (pdd) +- /* For GPU relying on IOMMU, we need to dequeue here +- * when PASID is still bound. 
+- */ +- kfd_process_dequeue_from_device(pdd); ++ p = find_process_by_mm(mm); ++ if (p) ++ kref_get(&p->ref); + +- mutex_unlock(&p->mutex); ++ srcu_read_unlock(&kfd_processes_srcu, idx); + +- kfd_unref_process(p); ++ return p; + } + +-struct kfd_process_device *kfd_get_first_process_device_data( +- struct kfd_process *p) ++/* kfd_process_evict_queues - Evict all user queues of a process ++ * ++ * Eviction is reference-counted per process-device. This means multiple ++ * evictions from different sources can be nested safely. ++ */ ++int kfd_process_evict_queues(struct kfd_process *p) + { +- return list_first_entry(&p->per_device_data, +- struct kfd_process_device, +- per_device_list); ++ struct kfd_process_device *pdd; ++ int r = 0; ++ unsigned int n_evicted = 0; ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, ++ &pdd->qpd); ++ if (r) { ++ pr_err("Failed to evict process queues\n"); ++ goto fail; ++ } ++ n_evicted++; ++ } ++ ++ return r; ++ ++fail: ++ /* To keep state consistent, roll back partial eviction by ++ * restoring queues ++ */ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ if (n_evicted == 0) ++ break; ++ if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, ++ &pdd->qpd)) ++ pr_err("Failed to restore queues\n"); ++ ++ n_evicted--; ++ } ++ ++ return r; + } + +-struct kfd_process_device *kfd_get_next_process_device_data( +- struct kfd_process *p, +- struct kfd_process_device *pdd) ++/* kfd_process_restore_queues - Restore all user queues of a process */ ++int kfd_process_restore_queues(struct kfd_process *p) + { +- if (list_is_last(&pdd->per_device_list, &p->per_device_data)) +- return NULL; +- return list_next_entry(pdd, per_device_list); ++ struct kfd_process_device *pdd; ++ int r, ret = 0; ++ ++ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { ++ r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, ++ &pdd->qpd); ++ if (r) { ++ pr_err("Failed to restore process queues\n"); ++ if (!ret) ++ ret = r; ++ } ++ } ++ ++ return ret; + } + +-bool kfd_has_process_device_data(struct kfd_process *p) ++static void evict_process_worker(struct work_struct *work) + { +- return !(list_empty(&p->per_device_data)); ++ int ret; ++ struct kfd_process *p; ++ struct delayed_work *dwork; ++ ++ dwork = to_delayed_work(work); ++ ++ /* Process termination destroys this worker thread. So during the ++ * lifetime of this thread, kfd_process p will be valid ++ */ ++ p = container_of(dwork, struct kfd_process, eviction_work); ++ WARN_ONCE(p->last_eviction_seqno != p->ef->seqno, ++ "Eviction fence mismatch\n"); ++ ++ /* Narrow window of overlap between restore and evict work ++ * item is possible. Once ++ * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs, ++ * it is possible to evicted again. But restore has few more ++ * steps of finish. So lets wait for any previous restore work ++ * to complete ++ */ ++ flush_delayed_work(&p->restore_work); ++ ++ pr_info("Started evicting process of pasid %d\n", p->pasid); ++ ret = kfd_process_evict_queues(p); ++ if (!ret) { ++ dma_fence_signal(p->ef); ++ dma_fence_put(p->ef); ++ p->ef = NULL; ++ queue_delayed_work(kfd_restore_wq, &p->restore_work, ++ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); ++ ++ pr_info("Finished evicting process of pasid %d\n", p->pasid); ++ } else ++ pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n", ++ p->pasid); + } + +-/* This increments the process->ref counter. 
*/ +-struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) ++static void restore_process_worker(struct work_struct *work) + { +- struct kfd_process *p, *ret_p = NULL; +- unsigned int temp; ++ struct delayed_work *dwork; ++ struct kfd_process *p; ++ struct kfd_process_device *pdd; ++ int ret = 0; + +- int idx = srcu_read_lock(&kfd_processes_srcu); ++ dwork = to_delayed_work(work); + +- hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { +- if (p->pasid == pasid) { +- kref_get(&p->ref); +- ret_p = p; +- break; +- } ++ /* Process termination destroys this worker thread. So during the ++ * lifetime of this thread, kfd_process p will be valid ++ */ ++ p = container_of(dwork, struct kfd_process, restore_work); ++ ++ /* Call restore_process_bos on the first KGD device. This function ++ * takes care of restoring the whole process including other devices. ++ * Restore can fail if enough memory is not available. If so, ++ * reschedule again. ++ */ ++ pdd = list_first_entry(&p->per_device_data, ++ struct kfd_process_device, ++ per_device_list); ++ ++ pr_info("Started restoring process of pasid %d\n", p->pasid); ++ ++ /* Setting last_restore_timestamp before successful restoration. ++ * Otherwise this would have to be set by KGD (restore_process_bos) ++ * before KFD BOs are unreserved. If not, the process can be evicted ++ * again before the timestamp is set. ++ * If restore fails, the timestamp will be set again in the next ++ * attempt. This would mean that the minimum GPU quanta would be ++ * PROCESS_ACTIVE_TIME_MS - (time to execute the following two ++ * functions) ++ */ ++ ++ p->last_restore_timestamp = get_jiffies_64(); ++ ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef); ++ if (ret) { ++ pr_info("Restore failed, try again after %d ms\n", ++ PROCESS_BACK_OFF_TIME_MS); ++ ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, ++ msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); ++ WARN(!ret, "reschedule restore work failed\n"); ++ return; + } + +- srcu_read_unlock(&kfd_processes_srcu, idx); ++ ret = kfd_process_restore_queues(p); ++ if (ret) ++ pr_err("Failed to resume user queues\n"); + +- return ret_p; ++ pr_info("Finished restoring process of pasid %d\n", p->pasid); + } + + int kfd_reserved_mem_mmap(struct kfd_process *process, +- struct vm_area_struct *vma) ++ struct vm_area_struct *vma) + { + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); + struct kfd_process_device *pdd; +@@ -633,6 +1123,23 @@ int kfd_reserved_mem_mmap(struct kfd_process *process, + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); + } + ++ ++void kfd_flush_tlb(struct kfd_process_device *pdd) ++{ ++ struct kfd_dev *dev = pdd->dev; ++ const struct kfd2kgd_calls *f2g = dev->kfd2kgd; ++ ++ if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) { ++ /* Nothing to flush until a VMID is assigned, which ++ * only happens when the first queue is created. 
++ */ ++ if (pdd->qpd.vmid) ++ f2g->invalidate_tlbs_vmid(dev->kgd, pdd->qpd.vmid); ++ } else { ++ f2g->invalidate_tlbs(dev->kgd, pdd->process->pasid); ++ } ++} ++ + #if defined(CONFIG_DEBUG_FS) + + int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +@@ -651,7 +1158,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) + r = pqm_debugfs_mqds(m, &p->pqm); + mutex_unlock(&p->mutex); + +- if (r) ++ if (r != 0) + break; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +index 33cf119..0617eee 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +@@ -119,9 +119,6 @@ static int create_cp_queue(struct process_queue_manager *pqm, + /* Doorbell initialized in user space*/ + q_properties->doorbell_ptr = NULL; + +- q_properties->doorbell_off = +- kfd_queue_id_to_doorbell(dev, pqm->process, qid); +- + /* let DQM handle it*/ + q_properties->vmid = 0; + q_properties->queue_id = qid; +@@ -189,9 +186,9 @@ int pqm_create_queue(struct process_queue_manager *pqm, + + switch (type) { + case KFD_QUEUE_TYPE_SDMA: +- if (dev->dqm->queue_count >= +- CIK_SDMA_QUEUES_PER_ENGINE * CIK_SDMA_ENGINE_NUM) { +- pr_err("Over-subscription is not allowed for SDMA.\n"); ++ if (dev->dqm->sdma_queue_count ++ >= get_num_sdma_queues(dev->dqm)) { ++ pr_debug("Over-subscription is not allowed for SDMA\n"); + retval = -EPERM; + goto err_create_queue; + } +@@ -208,10 +205,11 @@ int pqm_create_queue(struct process_queue_manager *pqm, + + case KFD_QUEUE_TYPE_COMPUTE: + /* check if there is over subscription */ +- if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && ++ if ((dev->dqm->sched_policy == ++ KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || + (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { +- pr_err("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); ++ pr_debug("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); + retval = -EPERM; + goto err_create_queue; + } +@@ -248,6 +246,15 @@ int pqm_create_queue(struct process_queue_manager *pqm, + goto err_create_queue; + } + ++ if (q) ++ /* Return the doorbell offset within the doorbell page ++ * to the caller so it can be passed up to user mode ++ * (in bytes). ++ */ ++ properties->doorbell_off = ++ (q->properties.doorbell_off * sizeof(uint32_t)) & ++ (kfd_doorbell_process_slice(dev) - 1); ++ + pr_debug("PQM After DQM create queue\n"); + + list_add(&pqn->process_queue_list, &pqm->queues); +@@ -311,6 +318,8 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) + + if (pqn->q) { + dqm = pqn->q->device->dqm; ++ kfree(pqn->q->properties.cu_mask); ++ pqn->q->properties.cu_mask = NULL; + retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval) { + pr_debug("Destroy queue failed, returned %d\n", retval); +@@ -356,6 +365,34 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, + return 0; + } + ++int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, ++ struct queue_properties *p) ++{ ++ int retval; ++ struct process_queue_node *pqn; ++ ++ pqn = get_queue_by_qid(pqm, qid); ++ if (!pqn) { ++ pr_debug("No queue %d exists for update operation\n", qid); ++ return -EFAULT; ++ } ++ ++ /* Free the old CU mask memory if it is already allocated, then ++ * allocate memory for the new CU mask. 
++ */ ++ kfree(pqn->q->properties.cu_mask); ++ ++ pqn->q->properties.cu_mask_count = p->cu_mask_count; ++ pqn->q->properties.cu_mask = p->cu_mask; ++ ++ retval = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, ++ pqn->q); ++ if (retval != 0) ++ return retval; ++ ++ return 0; ++} ++ + struct kernel_queue *pqm_get_kernel_queue( + struct process_queue_manager *pqm, + unsigned int qid) +@@ -369,6 +406,28 @@ struct kernel_queue *pqm_get_kernel_queue( + return NULL; + } + ++int pqm_get_wave_state(struct process_queue_manager *pqm, ++ unsigned int qid, ++ void __user *ctl_stack, ++ u32 *ctl_stack_used_size, ++ u32 *save_area_used_size) ++{ ++ struct process_queue_node *pqn; ++ ++ pqn = get_queue_by_qid(pqm, qid); ++ if (!pqn) { ++ pr_debug("amdkfd: No queue %d exists for operation\n", ++ qid); ++ return -EFAULT; ++ } ++ ++ return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm, ++ pqn->q, ++ ctl_stack, ++ ctl_stack_used_size, ++ save_area_used_size); ++} ++ + #if defined(CONFIG_DEBUG_FS) + + int pqm_debugfs_mqds(struct seq_file *m, void *data) +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +new file mode 100644 +index 0000000..985855f +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +@@ -0,0 +1,296 @@ ++/* ++ * Copyright 2015 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include <linux/device.h> ++#include <linux/export.h> ++#include <linux/pid.h> ++#include <linux/err.h> ++#include <linux/slab.h> ++#include "amd_rdma.h" ++#include "kfd_priv.h" ++ ++ ++struct rdma_cb { ++ struct list_head node; ++ struct amd_p2p_info amd_p2p_data; ++ void (*free_callback)(void *client_priv); ++ void *client_priv; ++}; ++ ++/** ++ * This function makes the pages underlying a range of GPU virtual memory ++ * accessible for DMA operations from another PCIe device ++ * ++ * \param address - The start address in the Unified Virtual Address ++ * space in the specified process ++ * \param length - The length of requested mapping ++ * \param pid - Pointer to structure pid to which address belongs. ++ * Could be NULL for current process address space. ++ * \param p2p_data - On return: Pointer to structure describing ++ * underlying pages/locations ++ * \param free_callback - Pointer to callback which will be called when access ++ * to such memory must be stopped immediately: Memory ++ * was freed, GECC events, etc. 
++ * Client should immediately stop any transfer ++ * operations and returned as soon as possible. ++ * After return all resources associated with address ++ * will be release and no access will be allowed. ++ * \param client_priv - Pointer to be passed as parameter on ++ * 'free_callback; ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_pages(uint64_t address, uint64_t length, struct pid *pid, ++ struct amd_p2p_info **amd_p2p_data, ++ void (*free_callback)(void *client_priv), ++ void *client_priv) ++{ ++ struct kfd_bo *buf_obj; ++ struct kgd_mem *mem; ++ struct sg_table *sg_table_tmp; ++ struct kfd_dev *dev; ++ uint64_t last = address + length - 1; ++ uint64_t offset; ++ struct kfd_process *p; ++ struct rdma_cb *rdma_cb_data; ++ int ret = 0; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_err("Could not find the process\n"); ++ return -EINVAL; ++ } ++ mutex_lock(&p->mutex); ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, last); ++ if (!buf_obj) { ++ pr_err("Cannot find a kfd_bo for the range\n"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ rdma_cb_data = kmalloc(sizeof(*rdma_cb_data), GFP_KERNEL); ++ if (!rdma_cb_data) { ++ *amd_p2p_data = NULL; ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ mem = buf_obj->mem; ++ dev = buf_obj->dev; ++ offset = address - buf_obj->it.start; ++ ++ ret = dev->kfd2kgd->pin_get_sg_table_bo(dev->kgd, mem, ++ offset, length, &sg_table_tmp); ++ ++ if (ret) { ++ pr_err("pin_get_sg_table_bo failed.\n"); ++ *amd_p2p_data = NULL; ++ goto free_mem; ++ } ++ ++ rdma_cb_data->amd_p2p_data.va = address; ++ rdma_cb_data->amd_p2p_data.size = length; ++ rdma_cb_data->amd_p2p_data.pid = pid; ++ rdma_cb_data->amd_p2p_data.priv = buf_obj; ++ rdma_cb_data->amd_p2p_data.pages = sg_table_tmp; ++ ++ rdma_cb_data->free_callback = free_callback; ++ rdma_cb_data->client_priv = client_priv; ++ ++ list_add(&rdma_cb_data->node, &buf_obj->cb_data_head); ++ ++ *amd_p2p_data = &rdma_cb_data->amd_p2p_data; ++ ++ goto out; ++ ++free_mem: ++ kfree(rdma_cb_data); ++out: ++ mutex_unlock(&p->mutex); ++ kfd_unref_process(p); ++ ++ return ret; ++} ++ ++static int put_pages_helper(struct amd_p2p_info *p2p_data) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_dev *dev; ++ struct sg_table *sg_table_tmp; ++ struct rdma_cb *rdma_cb_data; ++ ++ if (!p2p_data) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ rdma_cb_data = container_of(p2p_data, struct rdma_cb, amd_p2p_data); ++ ++ buf_obj = p2p_data->priv; ++ dev = buf_obj->dev; ++ sg_table_tmp = p2p_data->pages; ++ ++ list_del(&rdma_cb_data->node); ++ kfree(rdma_cb_data); ++ ++ dev->kfd2kgd->unpin_put_sg_table_bo(buf_obj->mem, sg_table_tmp); ++ ++ ++ return 0; ++} ++ ++void run_rdma_free_callback(struct kfd_bo *buf_obj) ++{ ++ struct rdma_cb *tmp, *rdma_cb_data; ++ ++ list_for_each_entry_safe(rdma_cb_data, tmp, ++ &buf_obj->cb_data_head, node) { ++ if (rdma_cb_data->free_callback) ++ rdma_cb_data->free_callback( ++ rdma_cb_data->client_priv); ++ } ++ list_for_each_entry_safe(rdma_cb_data, tmp, ++ &buf_obj->cb_data_head, node) ++ put_pages_helper(&rdma_cb_data->amd_p2p_data); ++} ++ ++/** ++ * ++ * This function release resources previously allocated by get_pages() call. ++ * ++ * \param p_p2p_data - A pointer to pointer to amd_p2p_info entries ++ * allocated by get_pages() call. 
++ * ++ * \return 0 if operation was successful ++ */ ++static int put_pages(struct amd_p2p_info **p_p2p_data) ++{ ++ struct kfd_process *p = NULL; ++ int ret = 0; ++ ++ if (!(*p_p2p_data)) { ++ pr_err("amd_p2p_info pointer is invalid.\n"); ++ return -EINVAL; ++ } ++ ++ p = kfd_lookup_process_by_pid((*p_p2p_data)->pid); ++ if (!p) { ++ pr_err("Could not find the process\n"); ++ return -EINVAL; ++ } ++ ++ ret = put_pages_helper(*p_p2p_data); ++ ++ if (!ret) ++ *p_p2p_data = NULL; ++ ++ kfd_unref_process(p); ++ ++ return ret; ++} ++ ++/** ++ * Check if given address belongs to GPU address space. ++ * ++ * \param address - Address to check ++ * \param pid - Process to which given address belongs. ++ * Could be NULL if current one. ++ * ++ * \return 0 - This is not GPU address managed by AMD driver ++ * 1 - This is GPU address managed by AMD driver ++ */ ++static int is_gpu_address(uint64_t address, struct pid *pid) ++{ ++ struct kfd_bo *buf_obj; ++ struct kfd_process *p; ++ ++ p = kfd_lookup_process_by_pid(pid); ++ if (!p) { ++ pr_debug("Could not find the process\n"); ++ return 0; ++ } ++ ++ buf_obj = kfd_process_find_bo_from_interval(p, address, address); ++ ++ kfd_unref_process(p); ++ if (!buf_obj) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * Return the single page size to be used when building scatter/gather table ++ * for given range. ++ * ++ * \param address - Address ++ * \param length - Range length ++ * \param pid - Process id structure. Could be NULL if current one. ++ * \param page_size - On return: Page size ++ * ++ * \return 0 if operation was successful ++ */ ++static int get_page_size(uint64_t address, uint64_t length, struct pid *pid, ++ unsigned long *page_size) ++{ ++ /* ++ * As local memory is always consecutive, we can assume the local ++ * memory page size to be arbitrary. ++ * Currently we assume the local memory page size to be the same ++ * as system memory, which is 4KB. ++ */ ++ *page_size = PAGE_SIZE; ++ ++ return 0; ++} ++ ++ ++/** ++ * Singleton object: rdma interface function pointers ++ */ ++static const struct amd_rdma_interface rdma_ops = { ++ .get_pages = get_pages, ++ .put_pages = put_pages, ++ .is_gpu_address = is_gpu_address, ++ .get_page_size = get_page_size, ++}; ++ ++/** ++ * amdkfd_query_rdma_interface - Return interface (function pointers table) for ++ * rdma interface ++ * ++ * ++ * \param interace - OUT: Pointer to interface ++ * ++ * \return 0 if operation was successful. 
++ */ ++int amdkfd_query_rdma_interface(const struct amd_rdma_interface **ops) ++{ ++ *ops = &rdma_ops; ++ ++ return 0; ++} ++EXPORT_SYMBOL(amdkfd_query_rdma_interface); ++ ++ ++ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index c6a7609..47bcca0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -35,6 +35,7 @@ + #include "kfd_crat.h" + #include "kfd_topology.h" + #include "kfd_device_queue_manager.h" ++#include "kfd_iommu.h" + + /* topology_device_list - Master list of all topology devices */ + static struct list_head topology_device_list; +@@ -88,7 +89,25 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) + down_read(&topology_lock); + + list_for_each_entry(top_dev, &topology_device_list, list) +- if (top_dev->gpu->pdev == pdev) { ++ if (top_dev->gpu && top_dev->gpu->pdev == pdev) { ++ device = top_dev->gpu; ++ break; ++ } ++ ++ up_read(&topology_lock); ++ ++ return device; ++} ++ ++struct kfd_dev *kfd_device_by_kgd(const struct kgd_dev *kgd) ++{ ++ struct kfd_topology_device *top_dev; ++ struct kfd_dev *device = NULL; ++ ++ down_read(&topology_lock); ++ ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->gpu && top_dev->gpu->kgd == kgd) { + device = top_dev->gpu; + break; + } +@@ -177,7 +196,6 @@ struct kfd_topology_device *kfd_create_topology_device( + return dev; + } + +- + #define sysfs_show_gen_prop(buffer, fmt, ...) \ + snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) + #define sysfs_show_32bit_prop(buffer, name, value) \ +@@ -186,6 +204,8 @@ struct kfd_topology_device *kfd_create_topology_device( + sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) + #define sysfs_show_32bit_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%u\n", value) ++#define sysfs_show_64bit_val(buffer, value) \ ++ sysfs_show_gen_prop(buffer, "%llu\n", value) + #define sysfs_show_str_val(buffer, value) \ + sysfs_show_gen_prop(buffer, "%s\n", value) + +@@ -268,11 +288,23 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, + { + ssize_t ret; + struct kfd_mem_properties *mem; ++ uint64_t used_mem; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; + +- mem = container_of(attr, struct kfd_mem_properties, attr); ++ if (strcmp(attr->name, "used_memory") == 0) { ++ mem = container_of(attr, struct kfd_mem_properties, ++ attr_used); ++ if (mem->gpu) { ++ used_mem = mem->gpu->kfd2kgd->get_vram_usage(mem->gpu->kgd); ++ return sysfs_show_64bit_val(buffer, used_mem); ++ } ++ /* TODO: Report APU/CPU-allocated memory; For now return 0 */ ++ return 0; ++ } ++ ++ mem = container_of(attr, struct kfd_mem_properties, attr_props); + sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, "flags", mem->flags); +@@ -377,6 +409,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t i; + uint32_t log_max_watch_addr; ++ struct kfd_local_mem_info local_mem_info; + + /* Making sure that the buffer is an empty string */ + buffer[0] = 0; +@@ -440,6 +473,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, "location_id", + dev->node_props.location_id); ++ sysfs_show_32bit_prop(buffer, "drm_render_minor", ++ dev->node_props.drm_render_minor); + + if (dev->gpu) 
{ + log_max_watch_addr = +@@ -462,13 +497,22 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + dev->node_props.max_engine_clk_fcompute); + +- sysfs_show_64bit_prop(buffer, "local_mem_size", +- (unsigned long long int) 0); ++ /* ++ * If the ASIC is APU except Kaveri, set local memory size ++ * to 0 to disable local memory support ++ */ ++ if (!dev->gpu->device_info->needs_iommu_device ++ || dev->gpu->device_info->asic_family == CHIP_KAVERI) { ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ sysfs_show_64bit_prop(buffer, "local_mem_size", ++ local_mem_info.local_mem_size_private + ++ local_mem_info.local_mem_size_public); ++ } else ++ sysfs_show_64bit_prop(buffer, "local_mem_size", 0ULL); + + sysfs_show_32bit_prop(buffer, "fw_version", +- dev->gpu->kfd2kgd->get_fw_version( +- dev->gpu->kgd, +- KGD_ENGINE_MEC1)); ++ dev->gpu->mec_fw_version); + sysfs_show_32bit_prop(buffer, "capability", + dev->node_props.capability); + } +@@ -527,7 +571,12 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) + if (dev->kobj_mem) { + list_for_each_entry(mem, &dev->mem_props, list) + if (mem->kobj) { +- kfd_remove_sysfs_file(mem->kobj, &mem->attr); ++ /* TODO: Remove when CPU/APU supported */ ++ if (dev->node_props.cpu_cores_count == 0) ++ sysfs_remove_file(mem->kobj, ++ &mem->attr_used); ++ kfd_remove_sysfs_file(mem->kobj, ++ &mem->attr_props); + mem->kobj = NULL; + } + kobject_del(dev->kobj_mem); +@@ -629,12 +678,23 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + if (ret < 0) + return ret; + +- mem->attr.name = "properties"; +- mem->attr.mode = KFD_SYSFS_FILE_MODE; +- sysfs_attr_init(&mem->attr); +- ret = sysfs_create_file(mem->kobj, &mem->attr); ++ mem->attr_props.name = "properties"; ++ mem->attr_props.mode = KFD_SYSFS_FILE_MODE; ++ sysfs_attr_init(&mem->attr_props); ++ ret = sysfs_create_file(mem->kobj, &mem->attr_props); + if (ret < 0) + return ret; ++ ++ /* TODO: Support APU/CPU memory usage */ ++ if (dev->node_props.cpu_cores_count == 0) { ++ mem->attr_used.name = "used_memory"; ++ mem->attr_used.mode = KFD_SYSFS_FILE_MODE; ++ sysfs_attr_init(&mem->attr_used); ++ ret = sysfs_create_file(mem->kobj, &mem->attr_used); ++ if (ret < 0) ++ return ret; ++ } ++ + i++; + } + +@@ -828,8 +888,7 @@ static void kfd_debug_print_topology(void) + up_read(&topology_lock); + } + +-/* Helper function for intializing platform_xx members of +- * kfd_system_properties. Uses OEM info from the last CPU/APU node. 
++/* Helper function for intializing platform_xx members of kfd_system_properties + */ + static void kfd_update_system_properties(void) + { +@@ -875,19 +934,8 @@ static void find_system_memory(const struct dmi_header *dm, + */ + static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) + { +- struct kfd_perf_properties *props; +- +- if (amd_iommu_pc_supported()) { +- props = kfd_alloc_struct(props); +- if (!props) +- return -ENOMEM; +- strcpy(props->block_name, "iommu"); +- props->max_concurrent = amd_iommu_pc_get_max_banks(0) * +- amd_iommu_pc_get_max_counters(0); /* assume one iommu */ +- list_add_tail(&props->list, &kdev->perf_props); +- } +- +- return 0; ++ /* These are the only counters supported so far */ ++ return kfd_iommu_add_perf_counters(kdev); + } + + /* kfd_add_non_crat_information - Add information that is not currently +@@ -904,6 +952,7 @@ static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ + } + ++#ifdef CONFIG_ACPI + /* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. + * Ignore CRAT for all other devices. AMD APU is identified if both CPU + * and GPU cores are present. +@@ -922,6 +971,7 @@ static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) + pr_info("Ignoring ACPI CRAT on non-APU system\n"); + return true; + } ++#endif + + int kfd_topology_init(void) + { +@@ -959,24 +1009,26 @@ int kfd_topology_init(void) + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. If no CRAT is available, it is assumed to be a CPU + */ ++#ifdef CONFIG_ACPI + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); +- if (!ret) { ++ if (ret == 0) { + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret || +- kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { ++ kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { ++ + kfd_release_topology_device_list( + &temp_topology_device_list); + kfd_destroy_crat_image(crat_image); + crat_image = NULL; + } + } +- ++#endif + if (!crat_image) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, +- COMPUTE_UNIT_CPU, NULL, +- proximity_domain); ++ COMPUTE_UNIT_CPU, NULL, ++ proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); +@@ -984,26 +1036,25 @@ int kfd_topology_init(void) + } + + ret = kfd_parse_crat_table(crat_image, +- &temp_topology_device_list, +- proximity_domain); ++ &temp_topology_device_list, ++ proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; + } + } +- + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, +- &topology_device_list); ++ &topology_device_list); + atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + +- if (!ret) { ++ if (ret == 0) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); +@@ -1076,15 +1127,22 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + { + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; ++ struct kfd_mem_properties *mem; + + down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) + if (!dev->gpu && 
(dev->node_props.simd_count > 0)) { + dev->gpu = gpu; + out_dev = dev; ++ ++ /* Assign mem->gpu */ ++ list_for_each_entry(mem, &dev->mem_props, list) ++ mem->gpu = dev->gpu; ++ + break; + } + up_write(&topology_lock); ++ + return out_dev; + } + +@@ -1152,7 +1210,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + + pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + +- proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); ++ proximity_domain = atomic_inc_return(& ++ topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. + * If so, assign the gpu to that device, +@@ -1163,16 +1222,15 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + dev = kfd_assign_gpu(gpu); + if (!dev) { + res = kfd_create_crat_image_virtual(&crat_image, &image_size, +- COMPUTE_UNIT_GPU, gpu, +- proximity_domain); ++ COMPUTE_UNIT_GPU, ++ gpu, proximity_domain); + if (res) { + pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + return res; + } + res = kfd_parse_crat_table(crat_image, +- &temp_topology_device_list, +- proximity_domain); ++ &temp_topology_device_list, proximity_domain); + if (res) { + pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", + gpu_id); +@@ -1189,13 +1247,14 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + +- if (!res) ++ if (res == 0) + sys_props.generation_count++; + else + pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", + gpu_id, res); + dev = kfd_assign_gpu(gpu); +- if (WARN_ON(!dev)) { ++ if (!dev) { ++ pr_err("Could not assign GPU\n"); + res = -ENODEV; + goto err; + } +@@ -1224,6 +1283,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); + dev->node_props.max_engine_clk_ccompute = + cpufreq_quick_get_max(0) / 1000; ++ dev->node_props.drm_render_minor = ++ gpu->shared_resources.drm_render_minor; + + kfd_fill_mem_clk_max_info(dev); + kfd_fill_iolink_non_crat_info(dev); +@@ -1245,16 +1306,21 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; ++ case CHIP_VEGA10: ++ case CHIP_RAVEN: ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; + default: +- WARN(1, "Unexpected ASIC family %u", +- dev->gpu->device_info->asic_family); ++ BUG(); + } + + /* Fix errors in CZ CRAT. +- * simd_count: Carrizo CRAT reports wrong simd_count, probably +- * because it doesn't consider masked out CUs +- * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd +- * capability flag: Carrizo CRAT doesn't report IOMMU flags ++ * simd_count: Carrizo CRAT reports wrong simd_count, probably because ++ * it doesn't consider masked out CUs ++ * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd. ++ * capability flag: Carrizo CRAT doesn't report IOMMU flags. 
+ */ + if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { + dev->node_props.simd_count = +@@ -1294,7 +1360,7 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) + + up_write(&topology_lock); + +- if (!res) ++ if (res == 0) + kfd_notify_gpu_change(gpu_id, 0); + + return res; +@@ -1333,17 +1399,18 @@ int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) + + static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) + { +- const struct cpuinfo_x86 *cpuinfo; + int first_cpu_of_numa_node; + +- if (!cpumask || cpumask == cpu_none_mask) ++ if (!cpumask || (cpumask == cpu_none_mask)) + return -1; + first_cpu_of_numa_node = cpumask_first(cpumask); + if (first_cpu_of_numa_node >= nr_cpu_ids) + return -1; +- cpuinfo = &cpu_data(first_cpu_of_numa_node); +- +- return cpuinfo->apicid; ++#ifdef CONFIG_X86_64 ++ return cpu_data(first_cpu_of_numa_node).apicid; ++#else ++ return first_cpu_of_numa_node; ++#endif + } + + /* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor +@@ -1377,7 +1444,7 @@ int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = dqm_debugfs_hqds(m, dev->gpu->dqm); +- if (r) ++ if (r != 0) + break; + } + +@@ -1402,7 +1469,7 @@ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); +- if (r) ++ if (r != 0) + break; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +index 53fca1f..f4d29c4 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +@@ -25,7 +25,7 @@ + + #include <linux/types.h> + #include <linux/list.h> +-#include "kfd_priv.h" ++#include "kfd_crat.h" + + #define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 + +@@ -45,6 +45,10 @@ + + #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 + #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 ++#define HSA_CAP_DOORBELL_TYPE_2_0 0x2 ++#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 ++#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 ++#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 + #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 + + struct kfd_node_properties { +@@ -71,6 +75,7 @@ struct kfd_node_properties { + uint32_t location_id; + uint32_t max_engine_clk_fcompute; + uint32_t max_engine_clk_ccompute; ++ int32_t drm_render_minor; + uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + }; + +@@ -93,7 +98,9 @@ struct kfd_mem_properties { + uint32_t width; + uint32_t mem_clk_max; + struct kobject *kobj; +- struct attribute attr; ++ struct kfd_dev *gpu; ++ struct attribute attr_props; ++ struct attribute attr_used; + }; + + #define HSA_CACHE_TYPE_DATA 0x00000001 +@@ -162,9 +169,9 @@ struct kfd_topology_device { + struct attribute attr_gpuid; + struct attribute attr_name; + struct attribute attr_props; +- uint8_t oem_id[CRAT_OEMID_LENGTH]; +- uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; +- uint32_t oem_revision; ++ uint8_t oem_id[CRAT_OEMID_LENGTH]; ++ uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; ++ uint32_t oem_revision; + }; + + struct kfd_system_properties { +@@ -183,8 +190,4 @@ struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list); + void kfd_release_topology_device_list(struct list_head *device_list); + +-extern bool amd_iommu_pc_supported(void); +-extern u8 amd_iommu_pc_get_max_banks(u16 devid); +-extern u8 amd_iommu_pc_get_max_counters(u16 devid); +- + #endif /* 
__KFD_TOPOLOGY_H__ */ +diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h +new file mode 100644 +index 0000000..e00d03d +--- /dev/null ++++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h +@@ -0,0 +1,84 @@ ++/* ++ * Copyright 2016 Advanced Micro Devices, Inc. ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR ++ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ++ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR ++ * OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef HSA_SOC15_INT_H_INCLUDED ++#define HSA_SOC15_INT_H_INCLUDED ++/* ++ * vega10+ IH clients ++ */ ++enum soc15_ih_client_id { ++ SOC15_IH_CLIENTID_IH = 0x00, ++ SOC15_IH_CLIENTID_ACP = 0x01, ++ SOC15_IH_CLIENTID_ATHUB = 0x02, ++ SOC15_IH_CLIENTID_BIF = 0x03, ++ SOC15_IH_CLIENTID_DCE = 0x04, ++ SOC15_IH_CLIENTID_ISP = 0x05, ++ SOC15_IH_CLIENTID_PCIE0 = 0x06, ++ SOC15_IH_CLIENTID_RLC = 0x07, ++ SOC15_IH_CLIENTID_SDMA0 = 0x08, ++ SOC15_IH_CLIENTID_SDMA1 = 0x09, ++ SOC15_IH_CLIENTID_SE0SH = 0x0a, ++ SOC15_IH_CLIENTID_SE1SH = 0x0b, ++ SOC15_IH_CLIENTID_SE2SH = 0x0c, ++ SOC15_IH_CLIENTID_SE3SH = 0x0d, ++ SOC15_IH_CLIENTID_SYSHUB = 0x0e, ++ SOC15_IH_CLIENTID_THM = 0x0f, ++ SOC15_IH_CLIENTID_UVD = 0x10, ++ SOC15_IH_CLIENTID_VCE0 = 0x11, ++ SOC15_IH_CLIENTID_VMC = 0x12, ++ SOC15_IH_CLIENTID_XDMA = 0x13, ++ SOC15_IH_CLIENTID_GRBM_CP = 0x14, ++ SOC15_IH_CLIENTID_ATS = 0x15, ++ SOC15_IH_CLIENTID_ROM_SMUIO = 0x16, ++ SOC15_IH_CLIENTID_DF = 0x17, ++ SOC15_IH_CLIENTID_VCE1 = 0x18, ++ SOC15_IH_CLIENTID_PWR = 0x19, ++ SOC15_IH_CLIENTID_UTCL2 = 0x1b, ++ SOC15_IH_CLIENTID_EA = 0x1c, ++ SOC15_IH_CLIENTID_UTCL2LOG = 0x1d, ++ SOC15_IH_CLIENTID_MP0 = 0x1e, ++ SOC15_IH_CLIENTID_MP1 = 0x1f, ++ ++ SOC15_IH_CLIENTID_MAX ++}; ++ ++ ++#define SOC15_INTSRC_CP_END_OF_PIPE 181 ++#define SOC15_INTSRC_CP_BAD_OPCODE 183 ++#define SOC15_INTSRC_SQ_INTERRUPT_MSG 239 ++#define SOC15_INTSRC_VMC_FAULT 0 ++#define SOC15_INTSRC_SDMA_TRAP 224 ++ ++ ++#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff) ++#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff) ++#define SOC15_RING_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 16 & 0xff) ++#define SOC15_VMID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 24 & 0xf) ++#define SOC15_VMID_TYPE_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 31 & 0x1) ++#define SOC15_PASID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[3]) & 0xffff) ++#define SOC15_CONTEXT_ID0_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[4])) ++#define SOC15_CONTEXT_ID1_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[5])) ++#define SOC15_CONTEXT_ID2_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[6])) 
++#define SOC15_CONTEXT_ID3_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[7])) ++ ++#endif ++ +-- +2.7.4 + |
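Two small pieces of doorbell handling are introduced by the hunks above: init_doorbell_bitmap() marks a doorbell index as reserved when (reserved_doorbell_mask & i) == reserved_doorbell_val, and pqm_create_queue() converts a queue's doorbell index into a byte offset inside the per-process doorbell page before handing it back to user mode. The stand-alone sketch below only illustrates that arithmetic; the mask, value and 4 KiB slice size are made-up example numbers, not values taken from the patch or from any ASIC.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Example values for illustration only */
	uint32_t reserved_mask = 0x3;	/* bits tested on the doorbell index */
	uint32_t reserved_val  = 0x2;	/* indices with low bits 10b are reserved */
	uint32_t slice_size    = 4096;	/* assumed per-process doorbell slice */

	for (uint32_t i = 0; i < 8; i++) {
		int reserved = (reserved_mask & i) == reserved_val;
		/* byte offset reported to user mode, as in pqm_create_queue() */
		unsigned int off = (i * sizeof(uint32_t)) & (slice_size - 1);

		printf("doorbell %u -> offset 0x%03x%s\n",
		       i, off, reserved ? " (reserved)" : "");
	}
	return 0;
}

With these example numbers, doorbell indices 2 and 6 come out reserved and doorbell 3 maps to byte offset 0xc, which is the offset a user-mode runtime would add to the mmapped doorbell page.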
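The new kfd_rdma.c above exports amdkfd_query_rdma_interface() so that a PeerDirect-capable driver can pin a KFD buffer object and obtain a scatter/gather table for DMA. The following is a minimal sketch of such a consumer, written only against the amd_rdma_interface signatures shown in this patch: the module, callback and helper names are hypothetical, error handling is abbreviated, and passing pid == NULL selects the current process as described in the get_pages() kernel-doc.

#include <linux/module.h>
#include <linux/scatterlist.h>
#include "amd_rdma.h"	/* interface header referenced by kfd_rdma.c above */

static const struct amd_rdma_interface *rdma_ops;

/* Hypothetical callback: the exporter invokes this when the mapping must be
 * torn down (buffer freed, etc.); a real client would quiesce DMA here.
 */
static void example_free_cb(void *client_priv)
{
	pr_info("peer mapping invalidated, stopping DMA\n");
}

/* Hypothetical helper: pin a GPU VA range of the current process and walk
 * the scatter/gather table returned by the KFD side.
 */
static int __maybe_unused example_pin_and_dump(uint64_t gpu_va, uint64_t size)
{
	struct amd_p2p_info *p2p = NULL;
	struct scatterlist *sg;
	int i, ret;

	if (!rdma_ops->is_gpu_address(gpu_va, NULL))
		return -EINVAL;

	ret = rdma_ops->get_pages(gpu_va, size, NULL, &p2p,
				  example_free_cb, NULL);
	if (ret)
		return ret;

	for_each_sg(p2p->pages->sgl, sg, p2p->pages->nents, i)
		pr_info("seg %d: dma 0x%llx len 0x%x\n", i,
			(unsigned long long)sg_dma_address(sg),
			sg_dma_len(sg));

	/* Drop the pinning; on success *p2p is cleared by put_pages() */
	return rdma_ops->put_pages(&p2p);
}

static int __init example_init(void)
{
	return amdkfd_query_rdma_interface(&rdma_ops);
}
module_init(example_init);

static void __exit example_exit(void)
{
}
module_exit(example_exit);

MODULE_LICENSE("GPL");

The free_callback path matters: run_rdma_free_callback() in the patch walks cb_data_head and invokes every registered callback before unpinning, so a consumer must be able to stop its transfers from that callback without calling back into put_pages() itself.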