From f00599ff354b3f061df8ce41217562f7c1bfcc2d Mon Sep 17 00:00:00 2001 From: Chaudhary Amit Kumar Date: Wed, 9 Jan 2019 21:21:38 +0530 Subject: [PATCH 5618/5725] drm/amdkfd: Roll back all q4 amdkfd patches added by Kalyan. Signed-off-by: Ravi Kumar Signed-off-by: Chaudhary Amit Kumar --- drivers/gpu/drm/amd/amdkfd/Makefile | 4 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 78 +- drivers/gpu/drm/amd/amdkfd/cik_int.h | 25 +- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 568 ---------- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 298 +++++- .../gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 439 +++++--- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1090 ++++++-------------- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 60 +- drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 48 +- drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c | 50 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 230 ++--- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 213 ++-- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 16 +- .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c | 6 +- .../drm/amd/amdkfd/kfd_device_queue_manager_vi.c | 29 +- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 22 +- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 129 +-- drivers/gpu/drm/amd/amdkfd/kfd_events.h | 1 - drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 61 +- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 81 +- drivers/gpu/drm/amd/amdkfd/kfd_iommu.c | 3 +- drivers/gpu/drm/amd/amdkfd/kfd_ipc.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 26 +- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c | 119 +++ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 78 +- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c | 180 ++-- drivers/gpu/drm/amd/amdkfd/kfd_module.c | 21 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 1 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h | 2 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c | 28 - drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c | 63 +- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c | 47 +- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 102 +- drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 178 ++-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 212 ++-- .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 26 +- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 8 +- drivers/gpu/drm/amd/amdkfd/kfd_rdma.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 94 +- drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 13 +- drivers/gpu/drm/amd/amdkfd/soc15_int.h | 41 +- 43 files changed, 1930 insertions(+), 2774 deletions(-) delete mode 100644 drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h mode change 100644 => 100755 drivers/gpu/drm/amd/amdkfd/kfd_device.c diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 4804f9c..b65537a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -24,7 +24,9 @@ # FULL_AMD_PATH=$(src)/.. 
-ccflags-y := -I$(FULL_AMD_PATH)/include \ + +ccflags-y := -Iinclude/drm \ + -I$(FULL_AMD_PATH)/include/ \ -I$(FULL_AMD_PATH)/include/asic_reg amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 5d2475d..751c004 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,20 @@ #include "kfd_events.h" #include "cik_int.h" +static bool is_cpc_vm_fault(struct kfd_dev *dev, + const uint32_t *ih_ring_entry) +{ + const struct cik_ih_ring_entry *ihre = + (const struct cik_ih_ring_entry *)ih_ring_entry; + + if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || + ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && + ihre->vmid >= dev->vm_info.first_vmid_kfd && + ihre->vmid <= dev->vm_info.last_vmid_kfd) + return true; + return false; +} + static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -32,7 +46,8 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - unsigned int vmid, pasid; + struct cik_ih_ring_entry *tmp_ihre = + (struct cik_ih_ring_entry *) patched_ihre; /* This workaround is due to HW/FW limitation on Hawaii that * VMID and PASID are not written into ih_ring_entry @@ -40,44 +55,23 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, if ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) && dev->device_info->asic_family == CHIP_HAWAII) { - struct cik_ih_ring_entry *tmp_ihre = - (struct cik_ih_ring_entry *)patched_ihre; - *patched_flag = true; *tmp_ihre = *ihre; - vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); - pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); - - tmp_ihre->ring_id &= 0x000000ff; - tmp_ihre->ring_id |= vmid << 8; - tmp_ihre->ring_id |= pasid << 16; - - return (pasid != 0) && - vmid >= dev->vm_info.first_vmid_kfd && - vmid <= dev->vm_info.last_vmid_kfd; + tmp_ihre->vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); + tmp_ihre->pasid = f2g->get_atc_vmid_pasid_mapping_pasid( + dev->kgd, tmp_ihre->vmid); + return (tmp_ihre->pasid != 0) && + tmp_ihre->vmid >= dev->vm_info.first_vmid_kfd && + tmp_ihre->vmid <= dev->vm_info.last_vmid_kfd; } - - /* Only handle interrupts from KFD VMIDs */ - vmid = (ihre->ring_id & 0x0000ff00) >> 8; - if (vmid < dev->vm_info.first_vmid_kfd || - vmid > dev->vm_info.last_vmid_kfd) - return 0; - - /* If there is no valid PASID, it's likely a firmware bug */ - pasid = (ihre->ring_id & 0xffff0000) >> 16; - if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt")) - return 0; - - /* Interrupt types we care about: various signals and faults. - * They will be forwarded to a work queue (see below). - */ - return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || + /* Do not process in ISR, just request it to be forwarded to WQ. 
*/ + return (ihre->pasid != 0) && + (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE || ihre->source_id == CIK_INTSRC_SDMA_TRAP || ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG || ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE || - ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || - ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT; + is_cpc_vm_fault(dev, ih_ring_entry)); } static void cik_event_interrupt_wq(struct kfd_dev *dev, @@ -86,35 +80,33 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; uint32_t context_id = ihre->data & 0xfffffff; - unsigned int vmid = (ihre->ring_id & 0x0000ff00) >> 8; - unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16; - if (pasid == 0) + if (ihre->pasid == 0) return; if (ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE) - kfd_signal_event_interrupt(pasid, context_id, 28); + kfd_signal_event_interrupt(ihre->pasid, context_id, 28); else if (ihre->source_id == CIK_INTSRC_SDMA_TRAP) - kfd_signal_event_interrupt(pasid, context_id, 28); + kfd_signal_event_interrupt(ihre->pasid, context_id, 28); else if (ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG) - kfd_signal_event_interrupt(pasid, context_id & 0xff, 8); + kfd_signal_event_interrupt(ihre->pasid, context_id & 0xff, 8); else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(pasid); + kfd_signal_hw_exception_event(ihre->pasid); else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT || ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; - kfd_process_vm_fault(dev->dqm, pasid); + kfd_process_vm_fault(dev->dqm, ihre->pasid); memset(&info, 0, sizeof(info)); dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info); if (!info.page_addr && !info.status) return; - if (info.vmid == vmid) - kfd_signal_vm_fault_event(dev, pasid, &info); + if (info.vmid == ihre->vmid) + kfd_signal_vm_fault_event(dev, ihre->pasid, &info); else - kfd_signal_vm_fault_event(dev, pasid, NULL); + kfd_signal_vm_fault_event(dev, ihre->pasid, NULL); } } diff --git a/drivers/gpu/drm/amd/amdkfd/cik_int.h b/drivers/gpu/drm/amd/amdkfd/cik_int.h index a2079a0..ff8255d 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_int.h +++ b/drivers/gpu/drm/amd/amdkfd/cik_int.h @@ -26,19 +26,32 @@ #include struct cik_ih_ring_entry { - uint32_t source_id; - uint32_t data; - uint32_t ring_id; - uint32_t reserved; + uint32_t source_id:8; + uint32_t reserved1:8; + uint32_t reserved2:16; + + uint32_t data:28; + uint32_t reserved3:4; + + /* pipeid, meid and unused3 are officially called RINGID, + * but for our purposes, they always decode into pipe and ME. + */ + uint32_t pipeid:2; + uint32_t meid:2; + uint32_t reserved4:4; + uint32_t vmid:8; + uint32_t pasid:16; + + uint32_t reserved5; }; +#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 #define CIK_INTSRC_CP_END_OF_PIPE 0xB5 #define CIK_INTSRC_CP_BAD_OPCODE 0xB7 -#define CIK_INTSRC_DEQUEUE_COMPLETE 0xC6 -#define CIK_INTSRC_SDMA_TRAP 0xE0 #define CIK_INTSRC_SQ_INTERRUPT_MSG 0xEF #define CIK_INTSRC_GFX_PAGE_INV_FAULT 0x92 #define CIK_INTSRC_GFX_MEM_PROT_FAULT 0x93 +#define CIK_INTSRC_SDMA_TRAP 0xE0 #endif diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h deleted file mode 100644 index 3621efb..0000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright 2018 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf82012b, - 0xb8f4f802, 0x89748674, - 0xb8f5f803, 0x8675ff75, - 0x00000400, 0xbf850017, - 0xc00a1e37, 0x00000000, - 0xbf8c007f, 0x87777978, - 0xbf840005, 0x8f728374, - 0xb972e0c2, 0xbf800002, - 0xb9740002, 0xbe801d78, - 0xb8f5f803, 0x8675ff75, - 0x000001ff, 0xbf850002, - 0x80708470, 0x82718071, - 0x8671ff71, 0x0000ffff, - 0x8f728374, 0xb972e0c2, - 0xbf800002, 0xb9740002, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, - 0xb97a0283, 0xbef60068, - 0xbef70069, 0xb8fa1c07, - 0x8e7a9c7a, 0x87717a71, - 0xb8fa03c7, 0x8e7a9b7a, - 0x87717a71, 0xb8faf807, - 0x867aff7a, 0x00007fff, - 0xb97af807, 0xbef2007e, - 0xbef3007f, 0xbefe0180, - 0xbf900004, 0x877a8474, - 0xb97af802, 0xbf8e0002, - 0xbf88fffe, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x867aff7f, - 0x08000000, 0x8f7a837a, - 0x877b7a7b, 0x867aff7f, - 0x70000000, 0x8f7a817a, - 0x877b7a7b, 0xbeef007c, - 0xbeee0080, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611c7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cbc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611cfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611d3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xb8f5f803, - 0xbefe007c, 0xbefc006e, - 0xc0611d7c, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dbc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xbefe007c, 0xbefc006e, - 0xc0611dfc, 0x0000007c, - 0x806e846e, 0xbefc007e, - 0xb8eff801, 0xbefe007c, - 0xbefc006e, 0xc0611bfc, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b3c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0xbefe007c, - 0xbefc006e, 0xc0611b7c, - 0x0000007c, 0x806e846e, - 0xbefc007e, 0x867aff7f, - 0x04000000, 0xbef30080, - 0x8773737a, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8f51605, 0x80758175, - 0x8e758475, 0x8e7a8275, - 0xbefa00ff, 0x01000000, - 0xbef60178, 0x80786e78, - 0x82798079, 0xbefc0080, - 0xbe802b00, 0xbe822b02, - 
0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003c, 0x00000000, - 0xc06b013c, 0x00000010, - 0xc06b023c, 0x00000020, - 0xc06b033c, 0x00000030, - 0x8078c078, 0x82798079, - 0x807c907c, 0xbf0a757c, - 0xbf85ffeb, 0xbef80176, - 0xbeee0080, 0xbefe00c1, - 0xbeff00c1, 0xbefa00ff, - 0x01000000, 0xe0724000, - 0x6e1e0000, 0xe0724100, - 0x6e1e0100, 0xe0724200, - 0x6e1e0200, 0xe0724300, - 0x6e1e0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f54306, - 0x8675c175, 0xbf84002c, - 0xbf8a0000, 0x867aff73, - 0x04000000, 0xbf840028, - 0x8e758675, 0x8e758275, - 0xbefa0075, 0xb8ee2a05, - 0x806e816e, 0x8e6e8a6e, - 0xb8fa1605, 0x807a817a, - 0x8e7a867a, 0x806e7a6e, - 0x806eff6e, 0x00000080, - 0xbefa00ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe80007b, - 0x867bff7b, 0xff7fffff, - 0x877bff7b, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8c007f, 0xe0765000, - 0x6e1e0002, 0x32040702, - 0xd0c9006a, 0x0000eb02, - 0xbf87fff7, 0xbefb0000, - 0xbeee00ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8f52a05, 0x80758175, - 0x8e758275, 0x8e7a8875, - 0xbefa00ff, 0x01000000, - 0xbefc0084, 0xbf0a757c, - 0xbf840015, 0xbf11017c, - 0x8075ff75, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x6e1e0000, - 0xe0724100, 0x6e1e0100, - 0xe0724200, 0x6e1e0200, - 0xe0724300, 0x6e1e0300, - 0x807c847c, 0x806eff6e, - 0x00000400, 0xbf0a757c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200cd, 0xbef8007e, - 0x8679ff7f, 0x0000ffff, - 0x8779ff79, 0x00040000, - 0xbefa0080, 0xbefb00ff, - 0x00807fac, 0x8676ff7f, - 0x08000000, 0x8f768376, - 0x877b767b, 0x8676ff7f, - 0x70000000, 0x8f768176, - 0x877b767b, 0x8676ff7f, - 0x04000000, 0xbf84001e, - 0xbefe00c1, 0xbeff00c1, - 0xb8f34306, 0x8673c173, - 0xbf840019, 0x8e738673, - 0x8e738273, 0xbefa0073, - 0xb8f22a05, 0x80728172, - 0x8e728a72, 0xb8f61605, - 0x80768176, 0x8e768676, - 0x80727672, 0x8072ff72, - 0x00000080, 0xbefa00ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x721e0000, - 0xe0510100, 0x721e0000, - 0x807cff7c, 0x00000200, - 0x8072ff72, 0x00000200, - 0xbf0a737c, 0xbf85fff6, - 0xbef20080, 0xbefe00c1, - 0xbeff00c1, 0xb8f32a05, - 0x80738173, 0x8e738273, - 0x8e7a8873, 0xbefa00ff, - 0x01000000, 0xbef60072, - 0x8072ff72, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x8073ff73, 0x00008000, - 0xe0524000, 0x721e0000, - 0xe0524100, 0x721e0100, - 0xe0524200, 0x721e0200, - 0xe0524300, 0x721e0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8072ff72, 0x00000400, - 0xbf0a737c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x761e0000, 0xe0524100, - 0x761e0100, 0xe0524200, - 0x761e0200, 0xe0524300, - 0x761e0300, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0x80f2c072, 0xb8f31605, - 0x80738173, 0x8e738473, - 0x8e7a8273, 0xbefa00ff, - 0x01000000, 0xbefc0073, - 0xc031003c, 0x00000072, - 0x80f2c072, 0xbf8c007f, - 0x80fc907c, 0xbe802d00, - 0xbe822d02, 0xbe842d04, - 0xbe862d06, 0xbe882d08, - 0xbe8a2d0a, 0xbe8c2d0c, - 0xbe8e2d0e, 0xbf06807c, - 0xbf84fff1, 0xb8f22a05, - 0x80728172, 0x8e728a72, - 0xb8f61605, 0x80768176, - 0x8e768676, 0x80727672, - 0xbefa0084, 0xbefa00ff, - 0x01000000, 0xc0211cfc, - 0x00000072, 0x80728472, - 0xc0211c3c, 0x00000072, - 0x80728472, 0xc0211c7c, - 0x00000072, 0x80728472, - 0xc0211bbc, 0x00000072, - 0x80728472, 0xc0211bfc, - 0x00000072, 0x80728472, - 0xc0211d3c, 0x00000072, - 0x80728472, 0xc0211d7c, - 0x00000072, 0x80728472, - 0xc0211a3c, 0x00000072, - 0x80728472, 
0xc0211a7c, - 0x00000072, 0x80728472, - 0xc0211dfc, 0x00000072, - 0x80728472, 0xc0211b3c, - 0x00000072, 0x80728472, - 0xc0211b7c, 0x00000072, - 0x80728472, 0xbf8c007f, - 0xbefc0073, 0xbefe006e, - 0xbeff006f, 0x867375ff, - 0x000003ff, 0xb9734803, - 0x867375ff, 0xfffff800, - 0x8f738b73, 0xb973a2c3, - 0xb977f801, 0x8673ff71, - 0xf0000000, 0x8f739c73, - 0x8e739073, 0xbef60080, - 0x87767376, 0x8673ff71, - 0x08000000, 0x8f739b73, - 0x8e738f73, 0x87767376, - 0x8673ff74, 0x00800000, - 0x8f739773, 0xb976f807, - 0x8671ff71, 0x0000ffff, - 0x86fe7e7e, 0x86ea6a6a, - 0x8f768374, 0xb976e0c2, - 0xbf800002, 0xb9740002, - 0xbf8a0000, 0x95807370, - 0xbf810000, 0x00000000, -}; - - -static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf82015d, - 0xb8f8f802, 0x89788678, - 0xb8f1f803, 0x866eff71, - 0x00000400, 0xbf850037, - 0x866eff71, 0x00000800, - 0xbf850003, 0x866eff71, - 0x00000100, 0xbf840008, - 0x866eff78, 0x00002000, - 0xbf840001, 0xbf810000, - 0x8778ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xb8eef807, 0x866fff6e, - 0x001f8000, 0x8e6f8b6f, - 0x8977ff77, 0xfc000000, - 0x87776f77, 0x896eff6e, - 0x001f8000, 0xb96ef807, - 0xb8f0f812, 0xb8f1f813, - 0x8ef08870, 0xc0071bb8, - 0x00000000, 0xbf8cc07f, - 0xc0071c38, 0x00000008, - 0xbf8cc07f, 0x86ee6e6e, - 0xbf840001, 0xbe801d6e, - 0xb8f1f803, 0x8671ff71, - 0x000001ff, 0xbf850002, - 0x806c846c, 0x826d806d, - 0x866dff6d, 0x0000ffff, - 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, - 0x86fe7e7e, 0x86ea6a6a, - 0x8f6e8378, 0xb96ee0c2, - 0xbf800002, 0xb9780002, - 0xbe801f6c, 0x866dff6d, - 0x0000ffff, 0xbef00080, - 0xb9700283, 0xb8f02407, - 0x8e709c70, 0x876d706d, - 0xb8f003c7, 0x8e709b70, - 0x876d706d, 0xb8f0f807, - 0x8670ff70, 0x00007fff, - 0xb970f807, 0xbeee007e, - 0xbeef007f, 0xbefe0180, - 0xbf900004, 0x87708478, - 0xb970f802, 0xbf8e0002, - 0xbf88fffe, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0xb8f11605, 0x80718171, - 0x8e718671, 0x80707170, - 0x80707e70, 0x8271807f, - 0x8671ff71, 0x0000ffff, - 0xc0471cb8, 0x00000040, - 0xbf8cc07f, 0xc04b1d38, - 0x00000048, 0xbf8cc07f, - 0xc0431e78, 0x00000058, - 0xbf8cc07f, 0xc0471eb8, - 0x0000005c, 0xbf8cc07f, - 0xbef4007e, 0x8675ff7f, - 0x0000ffff, 0x8775ff75, - 0x00040000, 0xbef60080, - 0xbef700ff, 0x00807fac, - 0x8670ff7f, 0x08000000, - 0x8f708370, 0x87777077, - 0x8670ff7f, 0x70000000, - 0x8f708170, 0x87777077, - 0xbefb007c, 0xbefa0080, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f01605, - 0x80708170, 0x8e708670, - 0x807a707a, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xbefe007c, 0xbefc007a, - 0xc0611efa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b3a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611b7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bba, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611bfa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611e3a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8f1f803, 0xbefe007c, - 0xbefc007a, 0xc0611c7a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, - 0xc0611a3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, - 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611a7a, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8fbf801, 0xbefe007c, - 0xbefc007a, 0xc0611efa, - 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0x8670ff7f, 0x04000000, - 0xbeef0080, 0x876f6f70, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f11605, - 0x80718171, 0x8e718471, - 
0x8e768271, 0xbef600ff, - 0x01000000, 0xbef20174, - 0x80747a74, 0x82758075, - 0xbefc0080, 0xbf800000, - 0xbe802b00, 0xbe822b02, - 0xbe842b04, 0xbe862b06, - 0xbe882b08, 0xbe8a2b0a, - 0xbe8c2b0c, 0xbe8e2b0e, - 0xc06b003a, 0x00000000, - 0xbf8cc07f, 0xc06b013a, - 0x00000010, 0xbf8cc07f, - 0xc06b023a, 0x00000020, - 0xbf8cc07f, 0xc06b033a, - 0x00000030, 0xbf8cc07f, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a717c, - 0xbf85ffe7, 0xbef40172, - 0xbefa0080, 0xbefe00c1, - 0xbeff00c1, 0xbee80080, - 0xbee90080, 0xbef600ff, - 0x01000000, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f14306, - 0x8671c171, 0xbf84002c, - 0xbf8a0000, 0x8670ff6f, - 0x04000000, 0xbf840028, - 0x8e718671, 0x8e718271, - 0xbef60071, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f01605, 0x80708170, - 0x8e708670, 0x807a707a, - 0x807aff7a, 0x00000080, - 0xbef600ff, 0x01000000, - 0xbefc0080, 0xd28c0002, - 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x7a1d0002, 0x68040702, - 0xd0c9006a, 0x0000e302, - 0xbf87fff7, 0xbef70000, - 0xbefa00ff, 0x00000400, - 0xbefe00c1, 0xbeff00c1, - 0xb8f12a05, 0x80718171, - 0x8e718271, 0x8e768871, - 0xbef600ff, 0x01000000, - 0xbefc0084, 0xbf0a717c, - 0xbf840015, 0xbf11017c, - 0x8071ff71, 0x00001000, - 0x7e000300, 0x7e020301, - 0x7e040302, 0x7e060303, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, - 0x807c847c, 0x807aff7a, - 0x00000400, 0xbf0a717c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200dc, 0xbef4007e, - 0x8675ff7f, 0x0000ffff, - 0x8775ff75, 0x00040000, - 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x08000000, 0x8f6e836e, - 0x87776e77, 0x866eff7f, - 0x70000000, 0x8f6e816e, - 0x87776e77, 0x866eff7f, - 0x04000000, 0xbf84001e, - 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf840019, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0x8078ff78, - 0x00000080, 0xbef600ff, - 0x01000000, 0xbefc0080, - 0xe0510000, 0x781d0000, - 0xe0510100, 0x781d0000, - 0x807cff7c, 0x00000200, - 0x8078ff78, 0x00000200, - 0xbf0a6f7c, 0xbf85fff6, - 0xbef80080, 0xbefe00c1, - 0xbeff00c1, 0xb8ef2a05, - 0x806f816f, 0x8e6f826f, - 0x8e76886f, 0xbef600ff, - 0x01000000, 0xbeee0078, - 0x8078ff78, 0x00000400, - 0xbefc0084, 0xbf11087c, - 0x806fff6f, 0x00008000, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, - 0x6e1d0000, 0xe0524100, - 0x6e1d0100, 0xe0524200, - 0x6e1d0200, 0xe0524300, - 0x6e1d0300, 0xb8f82a05, - 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x80f8c078, 0xb8ef1605, - 0x806f816f, 0x8e6f846f, - 0x8e76826f, 0xbef600ff, - 0x01000000, 0xbefc006f, - 0xc031003a, 0x00000078, - 0x80f8c078, 0xbf8cc07f, - 0x80fc907c, 0xbf800000, - 0xbe802d00, 0xbe822d02, - 0xbe842d04, 0xbe862d06, - 0xbe882d08, 0xbe8a2d0a, - 0xbe8c2d0c, 0xbe8e2d0e, - 0xbf06807c, 0xbf84fff0, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, - 0x806e816e, 0x8e6e866e, - 0x80786e78, 0xbef60084, - 0xbef600ff, 0x01000000, - 0xc0211bfa, 0x00000078, - 0x80788478, 0xc0211b3a, - 0x00000078, 0x80788478, - 0xc0211b7a, 
0x00000078, - 0x80788478, 0xc0211eba, - 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, - 0x80788478, 0xc0211c3a, - 0x00000078, 0x80788478, - 0xc0211c7a, 0x00000078, - 0x80788478, 0xc0211a3a, - 0x00000078, 0x80788478, - 0xc0211a7a, 0x00000078, - 0x80788478, 0xc0211cfa, - 0x00000078, 0x80788478, - 0xbf8cc07f, 0xbefc006f, - 0xbefe007a, 0xbeff007b, - 0x866f71ff, 0x000003ff, - 0xb96f4803, 0x866f71ff, - 0xfffff800, 0x8f6f8b6f, - 0xb96fa2c3, 0xb973f801, - 0xb8ee2a05, 0x806e816e, - 0x8e6e8a6e, 0xb8ef1605, - 0x806f816f, 0x8e6f866f, - 0x806e6f6e, 0x806e746e, - 0x826f8075, 0x866fff6f, - 0x0000ffff, 0xc0071cb7, - 0x00000040, 0xc00b1d37, - 0x00000048, 0xc0031e77, - 0x00000058, 0xc0071eb7, - 0x0000005c, 0xbf8cc07f, - 0x866fff6d, 0xf0000000, - 0x8f6f9c6f, 0x8e6f906f, - 0xbeee0080, 0x876e6f6e, - 0x866fff6d, 0x08000000, - 0x8f6f9b6f, 0x8e6f8f6f, - 0x876e6f6e, 0x866fff70, - 0x00800000, 0x8f6f976f, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e8370, - 0xb96ee0c2, 0xbf800002, - 0xb9700002, 0xbf8a0000, - 0x95806f6c, 0xbf810000, -}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index abe1a5d..751cc2e 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -20,12 +20,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -/* To compile this assembly code: - * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex - */ - -/* HW (VI) source code for CWSR trap handler */ -/* Version 18 + multiple trap handler */ +#if 0 +HW (VI) source code for CWSR trap handler +#Version 18 + multiple trap handler // this performance-optimal version was originally from Seven Xu at SRDC @@ -77,7 +74,7 @@ var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_D /*************************************************************************/ /* control on how to run the shader */ /*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) var EMU_RUN_HACK = 0 var EMU_RUN_HACK_RESTORE_NORMAL = 0 var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 @@ -91,9 +88,9 @@ var WG_BASE_ADDR_HI = 0x0 var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem var CTX_SAVE_CONTROL = 0x0 var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing /**************************************************************************/ @@ -101,12 +98,7 @@ var SWIZZLE_EN = 0 //whether we use swi 
/**************************************************************************/ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 -var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 -var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 -var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 -var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 @@ -157,7 +149,7 @@ var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = exec_hi //tba_lo and tba_hi need to be saved/restored -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 @@ -255,7 +247,7 @@ if (!EMU_RUN_HACK) s_waitcnt lgkmcnt(0) s_or_b32 ttmp7, ttmp8, ttmp9 s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set - set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler L_NO_NEXT_TRAP: @@ -266,7 +258,7 @@ L_NO_NEXT_TRAP: s_addc_u32 ttmp1, ttmp1, 0 L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF - set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) s_rfe_b64 [ttmp0, ttmp1] end // ********* End handling of non-CWSR traps ******************* @@ -327,10 +319,6 @@ end s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC end - // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 
- s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp - L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 @@ -1019,6 +1007,8 @@ end s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) @@ -1054,12 +1044,11 @@ end s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time if G8SR_DEBUG_TIMESTAMP s_memrealtime s_g8sr_ts_restore_d @@ -1139,10 +1128,257 @@ function get_hwreg_size_bytes return 128 //HWREG size 128 bytes end -function set_status_without_spi_prio(status, tmp) - // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. 
- s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT - s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp - s_nop 0x2 // avoid S_SETREG => S_SETREG hazard - s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status -end + +#endif + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf820123, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850011, + 0xc00a1e37, 0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840002, 0xb974f802, + 0xbe801d78, 0xb8f5f803, + 0x8675ff75, 0x000001ff, + 0xbf850002, 0x80708470, + 0x82718071, 0x8671ff71, + 0x0000ffff, 0xb974f802, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 
0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200ca, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x8671ff71, 0x0000ffff, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb974f802, 0xbf8a0000, + 0x95807370, 0xbf810000, +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 0bb9c57..bd2957c 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ 
b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -20,12 +20,9 @@ * OTHER DEALINGS IN THE SOFTWARE. */ -/* To compile this assembly code: - * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex - */ - -/* HW (GFX9) source code for CWSR trap handler */ -/* Version 18 + multiple trap handler */ +#if 0 +HW (GFX9) source code for CWSR trap handler +#Version 18 + multiple trap handler // this performance-optimal version was originally from Seven Xu at SRDC @@ -77,7 +74,7 @@ var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_D /*************************************************************************/ /* control on how to run the shader */ /*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +//any hack that needs to be made to run this code in EMU (either becasue various EMU code are not ready or no compute save & restore in EMU run) var EMU_RUN_HACK = 0 var EMU_RUN_HACK_RESTORE_NORMAL = 0 var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 @@ -89,9 +86,9 @@ var WG_BASE_ADDR_HI = 0x0 var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem var CTX_SAVE_CONTROL = 0x0 var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either becasue various RTL code are not ready or no compute save & restore in RTL run) var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //becasue TC EMU curently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency @@ -100,13 +97,8 @@ var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing /**************************************************************************/ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 -var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 -var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 -var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 -var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 -var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 @@ -130,14 +122,11 @@ var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME -var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 -var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 26 // bits [31:26] unused by SPI debug data -var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0xFC000000 /* Save */ var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes @@ -158,11 +147,11 @@ var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = 
exec_hi -var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3¡¯h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 -var s_save_tmp = ttmp4 +var s_save_status = ttmp4 var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine var s_save_xnack_mask_lo = ttmp6 var s_save_xnack_mask_hi = ttmp7 @@ -170,12 +159,11 @@ var s_save_buf_rsrc0 = ttmp8 var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 -var s_save_status = ttmp12 + var s_save_mem_offset = ttmp14 var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) var s_save_m0 = ttmp15 -var s_save_ttmps_lo = s_save_tmp //no conflict -var s_save_ttmps_hi = s_save_trapsts //no conflict /* Restore */ var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE @@ -198,7 +186,7 @@ var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp2 +var s_restore_tmp = ttmp6 var s_restore_mem_offset_save = s_restore_tmp //no conflict var s_restore_m0 = s_restore_alloc_size //no conflict @@ -217,8 +205,6 @@ var s_restore_buf_rsrc0 = ttmp8 var s_restore_buf_rsrc1 = ttmp9 var s_restore_buf_rsrc2 = ttmp10 var s_restore_buf_rsrc3 = ttmp11 -var s_restore_ttmps_lo = s_restore_tmp //no conflict -var s_restore_ttmps_hi = s_restore_alloc_size //no conflict /**************************************************************************/ /* trap handler entry points */ @@ -249,25 +235,25 @@ L_SKIP_RESTORE: s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save s_cbranch_scc1 L_SAVE //this is the operation for save // ********* Handle non-CWSR traps ******************* if (!EMU_RUN_HACK) // Illegal instruction is a non-maskable exception which blocks context save. // Halt the wavefront and return from the trap. - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK s_cbranch_scc1 L_HALT_WAVE // If STATUS.MEM_VIOL is asserted then we cannot fetch from the TMA. // Instead, halt the wavefront and return from the trap. - s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_FETCH_2ND_TRAP + s_and_b32 ttmp8, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_MEM_VIOL L_HALT_WAVE: // If STATUS.HALT is set then this fault must come from SQC instruction fetch. // We cannot prevent further faults so just terminate the wavefront. - s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + s_and_b32 ttmp8, s_save_status, SQ_WAVE_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_ALREADY_HALTED s_endpgm L_NOT_ALREADY_HALTED: @@ -278,31 +264,19 @@ L_NOT_ALREADY_HALTED: s_sub_u32 ttmp0, ttmp0, 0x8 s_subb_u32 ttmp1, ttmp1, 0x0 -L_FETCH_2ND_TRAP: - // Preserve and clear scalar XNACK state before issuing scalar reads. 
- // Save IB_STS.FIRST_REPLAY[15] and IB_STS.RCNT[20:16] into unused space ttmp11[31:26]. - s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_andn2_b32 ttmp11, ttmp11, TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK - s_or_b32 ttmp11, ttmp11, ttmp3 - - s_andn2_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - - // Read second-level TBA/TMA from first-level TMA and jump if available. - // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) - // ttmp12 holds SQ_WAVE_STATUS - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) - s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA - s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA - s_waitcnt lgkmcnt(0) - s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] - s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set - s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + s_branch L_EXCP_CASE + +L_NO_MEM_VIOL: + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_getreg_b32 ttmp14,hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp15,hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [ttmp14, ttmp15], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler L_NO_NEXT_TRAP: s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) @@ -312,18 +286,8 @@ L_NO_NEXT_TRAP: s_addc_u32 ttmp1, ttmp1, 0 L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF - - // Restore SQ_WAVE_IB_STS. - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - - // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - set_status_without_spi_prio(s_save_status, ttmp2) - - s_rfe_b64 [ttmp0, ttmp1] + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] end // ********* End handling of non-CWSR traps ******************* @@ -343,6 +307,8 @@ end s_mov_b32 s_save_tmp, 0 //clear saveCtx bit s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp @@ -370,10 +336,6 @@ end s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC end - // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. 
- s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) - s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp - L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 @@ -388,6 +350,7 @@ if G8SR_DEBUG_TIMESTAMP s_waitcnt lgkmcnt(0) end + /* setup Resource Contants */ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) //calculate wd_addr using absolute thread id v_readlane_b32 s_save_tmp, v9, 0 @@ -405,24 +368,7 @@ end else end - // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic - // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 - get_vgpr_size_bytes(s_save_ttmps_lo) - get_sgpr_size_bytes(s_save_ttmps_hi) - s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_ttmps_hi - s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo - s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 - s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF - s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 - ack_sqc_store_workaround() - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 - ack_sqc_store_workaround() - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 - ack_sqc_store_workaround() - s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 - ack_sqc_store_workaround() - /* setup Resource Contants */ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE @@ -479,8 +425,8 @@ end s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS - write_hwreg_to_mem(xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO - write_hwreg_to_mem(xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE @@ -556,8 +502,6 @@ end s_mov_b32 s_save_mem_offset, 0 s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on s_mov_b32 exec_hi, 0xFFFFFFFF - s_mov_b32 xnack_mask_lo, 0x0 - s_mov_b32 xnack_mask_hi, 0x0 if (SWIZZLE_EN) s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
@@ -1071,6 +1015,8 @@ end s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) @@ -1092,21 +1038,6 @@ end s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - - // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic - // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 - get_vgpr_size_bytes(s_restore_ttmps_lo) - get_sgpr_size_bytes(s_restore_ttmps_hi) - s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_ttmps_hi - s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 - s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 - s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 - s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 - s_waitcnt lgkmcnt(0) - //reuse s_restore_m0 as a temp register s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT @@ -1121,12 +1052,11 @@ end s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp - s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - set_status_without_spi_prio(s_restore_status, s_restore_tmp) // SCC is included, which is changed by previous salu + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu - s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time if G8SR_DEBUG_TIMESTAMP s_memrealtime s_g8sr_ts_restore_d @@ -1155,7 +1085,9 @@ function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on s_mov_b32 m0, s_mem_offset s_buffer_store_dword s, s_rsrc, m0 glc:1 - ack_sqc_store_workaround() +if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) +end s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo end @@ -1165,13 +1097,21 @@ end function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 - ack_sqc_store_workaround() +if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) +end s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 - ack_sqc_store_workaround() +if 
ACK_SQC_STORE + s_waitcnt lgkmcnt(0) +end s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 - ack_sqc_store_workaround() +if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) +end s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 - ack_sqc_store_workaround() +if ACK_SQC_STORE + s_waitcnt lgkmcnt(0) +end s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc end @@ -1211,16 +1151,261 @@ function get_hwreg_size_bytes return 128 //HWREG size 128 bytes end -function ack_sqc_store_workaround - if ACK_SQC_STORE - s_waitcnt lgkmcnt(0) - end -end -function set_status_without_spi_prio(status, tmp) - // Do not restore STATUS.SPI_PRIO since scheduler may have raised it. - s_lshr_b32 tmp, status, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT - s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE), tmp - s_nop 0x2 // avoid S_SETREG => S_SETREG hazard - s_setreg_b32 hwreg(HW_REG_STATUS, SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT, SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE), status -end + +#endif + +static const uint32_t cwsr_trap_gfx9_hex[] = { + 0xbf820001, 0xbf820130, + 0xb8f0f802, 0x89708670, + 0xb8f1f803, 0x8674ff71, + 0x00000400, 0xbf850023, + 0x8674ff71, 0x00000800, + 0xbf850003, 0x8674ff71, + 0x00000100, 0xbf840009, + 0x8674ff70, 0x00002000, + 0xbf840001, 0xbf810000, + 0x8770ff70, 0x00002000, + 0x80ec886c, 0x82ed806d, + 0xbf820010, 0xb8faf812, + 0xb8fbf813, 0x8efa887a, + 0xc00a1d3d, 0x00000000, + 0xbf8cc07f, 0x87737574, + 0xbf840002, 0xb970f802, + 0xbe801d74, 0xb8f1f803, + 0x8671ff71, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, + 0x0000ffff, 0xb970f802, + 0xbe801f6c, 0x866dff6d, + 0x0000ffff, 0xbef60080, + 0xb9760283, 0xbef20068, + 0xbef30069, 0xb8f62407, + 0x8e769c76, 0x876d766d, + 0xb8f603c7, 0x8e769b76, + 0x876d766d, 0xb8f6f807, + 0x8676ff76, 0x00007fff, + 0xb976f807, 0xbeee007e, + 0xbeef007f, 0xbefe0180, + 0xbf900004, 0xbf8e0002, + 0xbf88fffe, 0xbef4007e, + 0x8675ff7f, 0x0000ffff, + 0x8775ff75, 0x00040000, + 0xbef60080, 0xbef700ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x87777677, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x87777677, 0xbefb007c, + 0xbefa0080, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x807a767a, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xbefe007c, + 0xbefc007a, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611b3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611b7a, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611bba, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611bfa, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611c3a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8f1f803, + 0xbefe007c, 0xbefc007a, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xbefe007c, + 0xbefc007a, 0xc0611cba, + 0x0000007c, 0xbf8cc07f, + 0x807a847a, 0xbefc007e, + 0xbefe007c, 0xbefc007a, + 0xc0611cfa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0xb8fbf801, + 0xbefe007c, 0xbefc007a, + 0xc0611efa, 0x0000007c, + 0xbf8cc07f, 0x807a847a, + 0xbefc007e, 0x8676ff7f, + 0x04000000, 0xbeef0080, + 0x876f6f76, 0xb8fa2a05, + 0x807a817a, 0x8e7a8a7a, + 0xb8f11605, 0x80718171, + 0x8e718471, 0x8e768271, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747a74, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 
0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a717c, 0xbf85ffe7, + 0xbef40172, 0xbefa0080, + 0xbefe00c1, 0xbeff00c1, + 0xbef600ff, 0x01000000, + 0xe0724000, 0x7a1d0000, + 0xe0724100, 0x7a1d0100, + 0xe0724200, 0x7a1d0200, + 0xe0724300, 0x7a1d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8f14306, 0x8671c171, + 0xbf84002c, 0xbf8a0000, + 0x8676ff6f, 0x04000000, + 0xbf840028, 0x8e718671, + 0x8e718271, 0xbef60071, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x807a767a, 0x807aff7a, + 0x00000080, 0xbef600ff, + 0x01000000, 0xbefc0080, + 0xd28c0002, 0x000100c1, + 0xd28d0003, 0x000204c1, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x7a1d0002, + 0x68040702, 0xd0c9006a, + 0x0000e302, 0xbf87fff7, + 0xbef70000, 0xbefa00ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8f12a05, + 0x80718171, 0x8e718271, + 0x8e768871, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a717c, 0xbf840015, + 0xbf11017c, 0x8071ff71, + 0x00001000, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0xe0724000, + 0x7a1d0000, 0xe0724100, + 0x7a1d0100, 0xe0724200, + 0x7a1d0200, 0xe0724300, + 0x7a1d0300, 0x807c847c, + 0x807aff7a, 0x00000400, + 0xbf0a717c, 0xbf85ffef, + 0xbf9c0000, 0xbf8200c5, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x8672ff7f, 0x08000000, + 0x8f728372, 0x87777277, + 0x8672ff7f, 0x70000000, + 0x8f728172, 0x87777277, + 0x8672ff7f, 0x04000000, + 0xbf84001e, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf840019, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8f21605, 0x80728172, + 0x8e728672, 0x80787278, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbef80080, + 0xbefe00c1, 0xbeff00c1, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x8e76886f, + 0xbef600ff, 0x01000000, + 0xbef20078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0x806fff6f, + 0x00008000, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x721d0000, + 0xe0524100, 0x721d0100, + 0xe0524200, 0x721d0200, + 0xe0524300, 0x721d0300, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8f21605, + 0x80728172, 0x8e728672, + 0x80787278, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, + 0xbef600ff, 0x01000000, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8f21605, 0x80728172, + 0x8e728672, 0x80787278, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 
0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0x866dff6d, 0x0000ffff, + 0xbefc006f, 0xbefe007a, + 0xbeff007b, 0x866f71ff, + 0x000003ff, 0xb96f4803, + 0x866f71ff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0x866fff6d, + 0xf0000000, 0x8f6f9c6f, + 0x8e6f906f, 0xbef20080, + 0x87726f72, 0x866fff6d, + 0x08000000, 0x8f6f9b6f, + 0x8e6f8f6f, 0x87726f72, + 0x866fff70, 0x00800000, + 0x8f6f976f, 0xb972f807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb970f802, 0xbf8a0000, + 0x95806f6c, 0xbf810000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 56c1230..01c8b19 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -36,7 +35,6 @@ #include #include #include -#include #include "kfd_priv.h" #include "kfd_device_queue_manager.h" @@ -46,6 +44,7 @@ static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); static int kfd_mmap(struct file *, struct vm_area_struct *); +static bool kfd_dev_is_large_bar(struct kfd_dev *dev); static const char kfd_dev_name[] = "kfd"; @@ -137,9 +136,6 @@ static int kfd_open(struct inode *inode, struct file *filep) if (IS_ERR(process)) return PTR_ERR(process); - if (kfd_is_locked()) - return -EAGAIN; - dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", process->pasid, process->is_32bit_user_mode); @@ -251,7 +247,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, pr_debug("Queue Size: 0x%llX, %u\n", q_properties->queue_size, args->ring_size); - pr_debug("Queue r/w Pointers: %px, %px\n", + pr_debug("Queue r/w Pointers: %p, %p\n", q_properties->read_ptr, q_properties->write_ptr); @@ -903,7 +899,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, mutex_lock(&p->mutex); if (!kfd_has_process_device_data(p)) - goto out_unlock; + goto out_upwrite; /* Run over all pdd of the process */ pdd = kfd_get_first_process_device_data(p); @@ -912,7 +908,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, pdd = kfd_get_next_process_device_data(p, pdd); } while (pdd); - goto out_unlock; + goto out_upwrite; } /* Fill in process-aperture information for all available @@ -929,7 +925,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, if (!kfd_has_process_device_data(p)) { args->num_of_nodes = 0; kfree(pa); - goto out_unlock; + goto out_upwrite; } /* Run over all pdd of the process */ @@ -971,7 +967,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, kfree(pa); return ret ? -EFAULT : 0; -out_unlock: +out_upwrite: mutex_unlock(&p->mutex); return 0; } @@ -980,70 +976,55 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_create_event_args *args = data; - int err; - - /* For dGPUs the event page is allocated in user mode. The - * handle is passed to KFD with the first call to this IOCTL - * through the event_page_offset field. 
- */ - if (args->event_page_offset) { - struct kfd_dev *kfd; - struct kfd_process_device *pdd; - void *mem, *kern_addr; - uint64_t size; + struct kfd_dev *kfd; + struct kfd_process_device *pdd; + int err = -EINVAL; + void *mem, *kern_addr = NULL; - if (p->signal_page) { - pr_err("Event page is already set\n"); - return -EINVAL; - } + pr_debug("Event page offset 0x%llx\n", args->event_page_offset); + if (args->event_page_offset) { kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); if (!kfd) { pr_err("Getting device by id failed in %s\n", __func__); - return -EINVAL; + return -EFAULT; } - - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(kfd, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto out_unlock; - } - - mem = kfd_process_device_translate_handle(pdd, + if (!kfd->device_info->needs_iommu_device) { + mutex_lock(&p->mutex); + pdd = kfd_bind_process_to_device(kfd, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto out_upwrite; + } + mem = kfd_process_device_translate_handle(pdd, GET_IDR_HANDLE(args->event_page_offset)); - if (!mem) { - pr_err("Can't find BO, offset is 0x%llx\n", - args->event_page_offset); - err = -EINVAL; - goto out_unlock; - } - mutex_unlock(&p->mutex); - - err = kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, - mem, &kern_addr, &size); - if (err) { - pr_err("Failed to map event page to kernel\n"); - return err; - } + if (!mem) { + pr_err("Can't find BO, offset is 0x%llx\n", + args->event_page_offset); + err = -EFAULT; + goto out_upwrite; + } + mutex_unlock(&p->mutex); - err = kfd_event_page_set(p, kern_addr, size); - if (err) { - pr_err("Failed to set event page\n"); - return err; + /* Map dGPU gtt BO to kernel */ + kfd->kfd2kgd->map_gtt_bo_to_kernel(kfd->kgd, + mem, &kern_addr, NULL); } } - - err = kfd_event_create(filp, p, args->event_type, - args->auto_reset != 0, args->node_id, - &args->event_id, &args->event_trigger_data, - &args->event_page_offset, - &args->event_slot_index); + err = kfd_event_create(filp, p, + args->event_type, + args->auto_reset != 0, + args->node_id, + &args->event_id, + &args->event_trigger_data, + &args->event_page_offset, + &args->event_slot_index, + kern_addr); return err; -out_unlock: +out_upwrite: mutex_unlock(&p->mutex); return err; } @@ -1085,14 +1066,17 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, return err; } -static int kfd_ioctl_set_scratch_backing_va(struct file *filep, +static int kfd_ioctl_alloc_scratch_memory(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_set_scratch_backing_va_args *args = data; + struct kfd_ioctl_alloc_memory_of_scratch_args *args = data; struct kfd_process_device *pdd; struct kfd_dev *dev; long err; + if (args->size == 0) + return -EINVAL; + dev = kfd_device_by_id(args->gpu_id); if (!dev) return -EINVAL; @@ -1242,8 +1226,6 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, uint64_t offset = args->mmap_offset; uint32_t flags = args->flags; struct vm_area_struct *vma; - uint64_t cpuva = 0; - unsigned int mem_type = 0; if (args->size == 0) return -EINVAL; @@ -1273,13 +1255,6 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, flags |= KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL; flags &= ~KFD_IOC_ALLOC_MEM_FLAGS_USERPTR; offset = (pfn << PAGE_SHIFT); - } else { - if (offset & (PAGE_SIZE - 1)) { - pr_debug("Unaligned userptr address:%llx\n", - offset); - return -EINVAL; - } - cpuva = offset; } } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { if (args->size != kfd_doorbell_process_slice(dev)) @@ -1297,18 
+1272,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, err = dev->kfd2kgd->alloc_memory_of_gpu( dev->kgd, args->va_addr, args->size, - pdd->vm, NULL, (struct kgd_mem **) &mem, &offset, + pdd->vm, (struct kgd_mem **) &mem, &offset, flags); if (err) goto err_unlock; - mem_type = flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | - KFD_IOC_ALLOC_MEM_FLAGS_GTT | - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | - KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL); idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - args->va_addr, args->size, cpuva, mem_type, NULL); + args->va_addr, args->size, NULL); if (idr_handle < 0) { err = -EFAULT; goto err_free; @@ -1322,7 +1293,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, return 0; err_free: - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem); + dev->kfd2kgd->free_memory_of_gpu(dev->kgd, + (struct kgd_mem *) mem); err_unlock: mutex_unlock(&p->mutex); return err; @@ -1363,7 +1335,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, /* If freeing the buffer failed, leave the handle in place for * clean-up during process tear-down. */ - if (!ret) + if (ret == 0) kfd_process_device_remove_obj_handle( pdd, GET_IDR_HANDLE(args->handle)); @@ -1380,30 +1352,31 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, void *mem; struct kfd_dev *dev, *peer; long err = 0; - int i; + int i, num_dev = 0; uint32_t *devices_arr = NULL; dev = kfd_device_by_id(GET_GPU_ID(args->handle)); if (!dev) return -EINVAL; - if (!args->n_devices) { - pr_debug("Device IDs array empty\n"); + if (args->device_ids_array_size == 0) { + pr_debug("Device ID array size is 0\n"); return -EINVAL; } - if (args->n_success > args->n_devices) { - pr_debug("n_success exceeds n_devices\n"); + + if (args->device_ids_array_size % sizeof(uint32_t)) { + pr_debug("Node IDs array size %u\n", + args->device_ids_array_size); return -EINVAL; } - devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), - GFP_KERNEL); + devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); if (!devices_arr) return -ENOMEM; err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->n_devices * sizeof(*devices_arr)); + (void __user *)args->device_ids_array_ptr, + args->device_ids_array_size); if (err != 0) { err = -EFAULT; goto copy_from_user_failed; @@ -1424,11 +1397,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, goto get_mem_obj_from_handle_failed; } - for (i = args->n_success; i < args->n_devices; i++) { + num_dev = args->device_ids_array_size / sizeof(uint32_t); + for (i = 0 ; i < num_dev; i++) { peer = kfd_device_by_id(devices_arr[i]); if (!peer) { pr_debug("Getting device by id failed for 0x%x\n", - devices_arr[i]); + devices_arr[i]); err = -EINVAL; goto get_mem_obj_from_handle_failed; } @@ -1439,13 +1413,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, goto get_mem_obj_from_handle_failed; } err = peer->kfd2kgd->map_memory_to_gpu( - peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); - if (err) { - pr_err("Failed to map to gpu %d/%d\n", - i, args->n_devices); + peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); + if (err != 0) { + pr_err("Failed to map to gpu %d, num_dev=%d\n", + i, num_dev); goto map_memory_to_gpu_failed; } - args->n_success = i+1; } mutex_unlock(&p->mutex); @@ -1457,7 +1430,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, } /* Flush TLBs after waiting for the page table updates to complete */ - for (i = 0; i < args->n_devices; i++) { + for (i = 0; i < num_dev; i++) { peer 
= kfd_device_by_id(devices_arr[i]); if (WARN_ON_ONCE(!peer)) continue; @@ -1490,29 +1463,30 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, void *mem; struct kfd_dev *dev, *peer; long err = 0; - uint32_t *devices_arr = NULL, i; + uint32_t *devices_arr = NULL, num_dev, i; dev = kfd_device_by_id(GET_GPU_ID(args->handle)); if (!dev) return -EINVAL; - if (!args->n_devices) { - pr_debug("Device IDs array empty\n"); + if (args->device_ids_array_size == 0) { + pr_debug("Device ID array size is 0\n"); return -EINVAL; } - if (args->n_success > args->n_devices) { - pr_debug("n_success exceeds n_devices\n"); + + if (args->device_ids_array_size % sizeof(uint32_t)) { + pr_debug("Node IDs array size %u\n", + args->device_ids_array_size); return -EINVAL; } - devices_arr = kmalloc_array(args->n_devices, sizeof(*devices_arr), - GFP_KERNEL); + devices_arr = kmalloc(args->device_ids_array_size, GFP_KERNEL); if (!devices_arr) return -ENOMEM; err = copy_from_user(devices_arr, - (void __user *)args->device_ids_array_ptr, - args->n_devices * sizeof(*devices_arr)); + (void __user *)args->device_ids_array_ptr, + args->device_ids_array_size); if (err != 0) { err = -EFAULT; goto copy_from_user_failed; @@ -1522,7 +1496,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, pdd = kfd_get_process_device_data(dev, p); if (!pdd) { - err = -EINVAL; + pr_debug("Process device data doesn't exist\n"); + err = -ENODEV; goto bind_process_to_device_failed; } @@ -1533,7 +1508,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, goto get_mem_obj_from_handle_failed; } - for (i = args->n_success; i < args->n_devices; i++) { + num_dev = args->device_ids_array_size / sizeof(uint32_t); + for (i = 0 ; i < num_dev; i++) { peer = kfd_device_by_id(devices_arr[i]); if (!peer) { err = -EINVAL; @@ -1549,10 +1525,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm); if (err) { pr_err("Failed to unmap from gpu %d/%d\n", - i, args->n_devices); + i, num_dev); goto unmap_memory_from_gpu_failed; } - args->n_success = i+1; } kfree(devices_arr); @@ -1569,6 +1544,34 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, return err; } +static int kfd_ioctl_set_process_dgpu_aperture(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_process_dgpu_aperture_args *args = data; + struct kfd_dev *dev; + struct kfd_process_device *pdd; + long err; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = PTR_ERR(pdd); + goto exit; + } + + err = kfd_set_process_dgpu_aperture(pdd, args->dgpu_base, + args->dgpu_limit); + +exit: + mutex_unlock(&p->mutex); + return err; +} + static int kfd_ioctl_get_dmabuf_info(struct file *filep, struct kfd_process *p, void *data) { @@ -1683,636 +1686,22 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep, return r; } -/* Maximum number of entries for process pages array which lives on stack */ -#define MAX_PP_STACK_COUNT 16 -/* Maximum number of pages kmalloc'd to hold struct page's during copy */ -#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2) -#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *)) - -static void kfd_put_sg_table(struct sg_table *sg) -{ - unsigned int i; - struct scatterlist *s; - - for_each_sg(sg->sgl, s, sg->nents, i) - put_page(sg_page(s)); -} - - -/* Create a sg table for the given userptr BO by pinning its system pages - * 
@bo: userptr BO - * @offset: Offset into BO - * @mm/@task: mm_struct & task_struct of the process that holds the BO - * @size: in/out: desired size / actual size which could be smaller - * @sg_size: out: Size of sg table. This is ALIGN_UP(@size) - * @ret_sg: out sg table - */ -static int kfd_create_sg_table_from_userptr_bo(struct kfd_bo *bo, - int64_t offset, int cma_write, - struct mm_struct *mm, - struct task_struct *task, - uint64_t *size, - uint64_t *sg_size, - struct sg_table **ret_sg) -{ - int ret, locked = 1; - struct sg_table *sg = NULL; - unsigned int i, offset_in_page, flags = 0; - unsigned long nents, n; - unsigned long pa = (bo->cpuva + offset) & PAGE_MASK; - unsigned int cur_page = 0; - struct scatterlist *s; - uint64_t sz = *size; - struct page **process_pages; - - *sg_size = 0; - sg = kmalloc(sizeof(*sg), GFP_KERNEL); - if (!sg) - return -ENOMEM; - - offset_in_page = offset & (PAGE_SIZE - 1); - nents = (sz + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE; - - ret = sg_alloc_table(sg, nents, GFP_KERNEL); - if (unlikely(ret)) { - ret = -ENOMEM; - goto sg_alloc_fail; - } - process_pages = kmalloc_array(nents, sizeof(struct pages *), - GFP_KERNEL); - if (!process_pages) { - ret = -ENOMEM; - goto page_alloc_fail; - } - - if (cma_write) - flags = FOLL_WRITE; - locked = 1; - down_read(&mm->mmap_sem); - n = get_user_pages_remote(task, mm, pa, nents, flags, process_pages, - NULL, &locked); - if (locked) - up_read(&mm->mmap_sem); - if (n <= 0) { - pr_err("CMA: Invalid virtual address 0x%lx\n", pa); - ret = -EFAULT; - goto get_user_fail; - } - if (n != nents) { - /* Pages pinned < requested. Set the size accordingly */ - *size = (n * PAGE_SIZE) - offset_in_page; - pr_debug("Requested %lx but pinned %lx\n", nents, n); - } - - sz = 0; - for_each_sg(sg->sgl, s, n, i) { - sg_set_page(s, process_pages[cur_page], PAGE_SIZE, - offset_in_page); - sg_dma_address(s) = page_to_phys(process_pages[cur_page]); - offset_in_page = 0; - cur_page++; - sz += PAGE_SIZE; - } - *ret_sg = sg; - *sg_size = sz; - - kfree(process_pages); - return 0; - -get_user_fail: - kfree(process_pages); -page_alloc_fail: - sg_free_table(sg); -sg_alloc_fail: - kfree(sg); - return ret; -} - -static void kfd_free_cma_bos(struct cma_iter *ci) -{ - struct cma_system_bo *cma_bo, *tmp; - - list_for_each_entry_safe(cma_bo, tmp, &ci->cma_list, list) { - struct kfd_dev *dev = cma_bo->dev; - - /* sg table is deleted by free_memory_of_gpu */ - if (cma_bo->sg) - kfd_put_sg_table(cma_bo->sg); - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem); - list_del(&cma_bo->list); - kfree(cma_bo); - } -} - -/* 1 second timeout */ -#define CMA_WAIT_TIMEOUT msecs_to_jiffies(1000) - -static int kfd_cma_fence_wait(struct dma_fence *f) -{ - int ret; - - ret = dma_fence_wait_timeout(f, false, CMA_WAIT_TIMEOUT); - if (likely(ret > 0)) - return 0; - if (!ret) - ret = -ETIME; - return ret; -} - -/* Put previous (old) fence @pf but it waits for @pf to signal if the context - * of the current fence @cf is different. - */ -static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf, - struct dma_fence *pf) -{ - int ret = 0; - - if (pf && cf && cf->context != pf->context) - ret = kfd_cma_fence_wait(pf); - dma_fence_put(pf); - return ret; -} - -#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE) - -/* Create an equivalent system BO for the given @bo. If @bo is a userptr then - * create a new system BO by pinning underlying system pages of the given - * userptr BO. If @bo is in Local Memory then create an empty system BO and - * then copy @bo into this new BO. 
- * @bo: Userptr BO or Local Memory BO - * @offset: Offset into bo - * @size: in/out: The size of the new BO could be less than requested if all - * the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would - * be reflected in @size - * @mm/@task: mm/task to which @bo belongs to - * @cma_bo: out: new system BO - */ -static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo, - uint64_t *size, uint64_t offset, - int cma_write, struct kfd_process *p, - struct mm_struct *mm, - struct task_struct *task, - struct cma_system_bo **cma_bo) -{ - int ret; - struct kfd_process_device *pdd = NULL; - struct cma_system_bo *cbo; - uint64_t bo_size = 0; - struct dma_fence *f; - - uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_WRITABLE | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE; - - *cma_bo = NULL; - cbo = kzalloc(sizeof(**cma_bo), GFP_KERNEL); - if (!cbo) - return -ENOMEM; - - INIT_LIST_HEAD(&cbo->list); - if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) - bo_size = min_t(uint64_t, *size, MAX_SYSTEM_BO_SIZE); - else if (bo->cpuva) { - ret = kfd_create_sg_table_from_userptr_bo(bo, offset, - cma_write, mm, task, - size, &bo_size, - &cbo->sg); - if (ret) { - pr_err("CMA: BO create with sg failed %d\n", ret); - goto sg_fail; - } - } else { - WARN_ON(1); - ret = -EINVAL; - goto sg_fail; - } - mutex_lock(&p->mutex); - pdd = kfd_get_process_device_data(kdev, p); - if (!pdd) { - mutex_unlock(&p->mutex); - pr_err("Process device data doesn't exist\n"); - ret = -EINVAL; - goto pdd_fail; - } - - ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size, - pdd->vm, cbo->sg, - &cbo->mem, NULL, flags); - mutex_unlock(&p->mutex); - if (ret) { - pr_err("Failed to create shadow system BO %d\n", ret); - goto pdd_fail; - } - - if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { - ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem, - offset, cbo->mem, 0, - bo_size, &f, size); - if (ret) { - pr_err("CMA: Intermediate copy failed %d\n", ret); - goto copy_fail; - } - - /* Wait for the copy to finish as subsequent copy will be done - * by different device - */ - ret = kfd_cma_fence_wait(f); - dma_fence_put(f); - if (ret) { - pr_err("CMA: Intermediate copy timed out %d\n", ret); - goto copy_fail; - } - } - - cbo->dev = kdev; - *cma_bo = cbo; - - return ret; - -copy_fail: - kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem); -pdd_fail: - if (cbo->sg) { - kfd_put_sg_table(cbo->sg); - sg_free_table(cbo->sg); - kfree(cbo->sg); - } -sg_fail: - kfree(cbo); - return ret; -} - -/* Update cma_iter.cur_bo with KFD BO that is assocaited with - * cma_iter.array.va_addr - */ -static int kfd_cma_iter_update_bo(struct cma_iter *ci) -{ - struct kfd_memory_range *arr = ci->array; - uint64_t va_end = arr->va_addr + arr->size - 1; - - mutex_lock(&ci->p->mutex); - ci->cur_bo = kfd_process_find_bo_from_interval(ci->p, arr->va_addr, - va_end); - mutex_unlock(&ci->p->mutex); - - if (!ci->cur_bo || va_end > ci->cur_bo->it.last) { - pr_err("CMA failed. Range out of bounds\n"); - return -EFAULT; - } - return 0; -} - -/* Advance iter by @size bytes. */ -static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size) -{ - int ret = 0; - - ci->offset += size; - if (WARN_ON(size > ci->total || ci->offset > ci->array->size)) - return -EFAULT; - ci->total -= size; - /* If current range is copied, move to next range if available. 
*/ - if (ci->offset == ci->array->size) { - - /* End of all ranges */ - if (!(--ci->nr_segs)) - return 0; - - ci->array++; - ci->offset = 0; - ret = kfd_cma_iter_update_bo(ci); - if (ret) - return ret; - } - ci->bo_offset = (ci->array->va_addr + ci->offset) - - ci->cur_bo->it.start; - return ret; -} - -static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs, - struct kfd_process *p, struct mm_struct *mm, - struct task_struct *task, struct cma_iter *ci) -{ - int ret; - int nr; - - if (!arr || !segs) - return -EINVAL; - - memset(ci, 0, sizeof(*ci)); - INIT_LIST_HEAD(&ci->cma_list); - ci->array = arr; - ci->nr_segs = segs; - ci->p = p; - ci->offset = 0; - ci->mm = mm; - ci->task = task; - for (nr = 0; nr < segs; nr++) - ci->total += arr[nr].size; - - /* Valid but size is 0. So copied will also be 0 */ - if (!ci->total) - return 0; - - ret = kfd_cma_iter_update_bo(ci); - if (!ret) - ci->bo_offset = arr->va_addr - ci->cur_bo->it.start; - return ret; -} - -static bool kfd_cma_iter_end(struct cma_iter *ci) -{ - if (!(ci->nr_segs) || !(ci->total)) - return true; - return false; -} - -/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes - * both source and dest. BOs are userptr BOs. Both BOs can either belong to - * current process or one of the BOs can belong to a differnt - * process. @Returns 0 on success, -ve on failure - * - * @si: Source iter - * @di: Dest. iter - * @cma_write: Indicates if it is write to remote or read from remote - * @size: amount of bytes to be copied - * @copied: Return number of bytes actually copied. - */ -static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di, - bool cma_write, uint64_t size, - uint64_t *copied) -{ - int i, ret = 0, locked; - unsigned int nents, nl; - unsigned int offset_in_page; - struct page *pp_stack[MAX_PP_STACK_COUNT]; - struct page **process_pages = pp_stack; - unsigned long rva, lva = 0, flags = 0; - uint64_t copy_size, to_copy = size; - struct cma_iter *li, *ri; - - if (cma_write) { - ri = di; - li = si; - flags |= FOLL_WRITE; - } else { - li = di; - ri = si; - } - /* rva: remote virtual address. Page aligned to start page. - * rva + offset_in_page: Points to remote start address - * lva: local virtual address. Points to the start address. 
- * nents: computes number of remote pages to request - */ - offset_in_page = ri->bo_offset & (PAGE_SIZE - 1); - rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK; - lva = li->cur_bo->cpuva + li->bo_offset; - - nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE; - - copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page); - *copied = 0; - - if (nents > MAX_PP_STACK_COUNT) { - /* For reliability kmalloc only 2 pages worth */ - process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES, - sizeof(struct pages *)*nents), - GFP_KERNEL); - - if (!process_pages) - return -ENOMEM; - } - - while (nents && to_copy) { - nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents); - locked = 1; - down_read(&ri->mm->mmap_sem); - nl = get_user_pages_remote(ri->task, ri->mm, rva, nl, - flags, process_pages, NULL, - &locked); - if (locked) - up_read(&ri->mm->mmap_sem); - if (nl <= 0) { - pr_err("CMA: Invalid virtual address 0x%lx\n", rva); - ret = -EFAULT; - break; - } - - for (i = 0; i < nl; i++) { - unsigned int n; - void *kaddr = kmap(process_pages[i]); - - if (cma_write) { - n = copy_from_user(kaddr+offset_in_page, - (void *)lva, copy_size); - set_page_dirty(process_pages[i]); - } else { - n = copy_to_user((void *)lva, - kaddr+offset_in_page, - copy_size); - } - kunmap(kaddr); - if (n) { - ret = -EFAULT; - break; - } - to_copy -= copy_size; - if (!to_copy) - break; - lva += copy_size; - rva += (copy_size + offset_in_page); - WARN_ONCE(rva & (PAGE_SIZE - 1), - "CMA: Error in remote VA computation"); - offset_in_page = 0; - copy_size = min_t(uint64_t, to_copy, PAGE_SIZE); - } - - for (i = 0; i < nl; i++) - put_page(process_pages[i]); - - if (ret) - break; - nents -= nl; - } - - if (process_pages != pp_stack) - kfree(process_pages); - - *copied = (size - to_copy); - return ret; - -} - -/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their - * respective offset. - * @si: Source iter - * @di: Dest. iter - * @cma_write: Indicates if it is write to remote or read from remote - * @size: amount of bytes to be copied - * @f: Return the last fence if any - * @copied: Return number of bytes actually copied. - */ -static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di, - int cma_write, uint64_t size, - struct dma_fence **f, uint64_t *copied) -{ - int err = 0; - struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo; - uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset; - struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem; - struct kfd_dev *dev = dst_bo->dev; - struct cma_system_bo *tmp_bo = NULL; - - *copied = 0; - if (f) - *f = NULL; - if (src_bo->cpuva && dst_bo->cpuva) - return kfd_copy_userptr_bos(si, di, cma_write, size, copied); - - /* If either source or dest. is userptr, create a shadow system BO - * by using the underlying userptr BO pages. Then use this shadow - * BO for copy. src_offset & dst_offset are adjusted because the new BO - * is only created for the window (offset, size) requested. - * The shadow BO is created on the other device. This means if the - * other BO is a device memory, the copy will be using that device. - * The BOs are stored in cma_list for deferred cleanup. This minimizes - * fence waiting just to the last fence. 
- */ - if (src_bo->cpuva) { - dev = dst_bo->dev; - err = kfd_create_cma_system_bo(dev, src_bo, &size, - si->bo_offset, cma_write, - si->p, si->mm, si->task, - &si->cma_bo); - src_mem = si->cma_bo->mem; - src_offset = si->bo_offset & (PAGE_SIZE - 1); - list_add_tail(&si->cma_bo->list, &si->cma_list); - } else if (dst_bo->cpuva) { - dev = src_bo->dev; - err = kfd_create_cma_system_bo(dev, dst_bo, &size, - di->bo_offset, cma_write, - di->p, di->mm, di->task, - &di->cma_bo); - dst_mem = di->cma_bo->mem; - dst_offset = di->bo_offset & (PAGE_SIZE - 1); - list_add_tail(&di->cma_bo->list, &di->cma_list); - } else if (src_bo->dev->kgd != dst_bo->dev->kgd) { - /* This indicates that atleast on of the BO is in local mem. - * If both are in local mem of different devices then create an - * intermediate System BO and do a double copy - * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM]. - * If only one BO is in VRAM then use that GPU to do the copy - */ - if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM && - dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { - dev = dst_bo->dev; - err = kfd_create_cma_system_bo(src_bo->dev, src_bo, - &size, si->bo_offset, - cma_write, si->p, - si->mm, si->task, - &tmp_bo); - src_mem = tmp_bo->mem; - src_offset = 0; - } else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) - dev = src_bo->dev; - /* else already set to dst_bo->dev */ - } - - if (err) { - pr_err("Failed to create system BO %d", err); - return -EINVAL; - } - - err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset, - dst_mem, dst_offset, size, f, - copied); - /* The tmp_bo allocates additional memory. So it is better to wait and - * delete. Also since multiple GPUs are involved the copies are - * currently not pipelined. - */ - if (tmp_bo) { - if (!err) { - kfd_cma_fence_wait(*f); - dma_fence_put(*f); - *f = NULL; - } - dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem); - kfree(tmp_bo); - } - return err; -} - -/* Copy single range from source iterator @si to destination iterator @di. - * @si will move to next range and @di will move by bytes copied. 
- * @return : 0 for success or -ve for failure - * @f: The last fence if any - * @copied: out: number of bytes copied - */ -static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di, - bool cma_write, struct dma_fence **f, - uint64_t *copied) -{ - int err = 0; - uint64_t copy_size, n; - uint64_t size = si->array->size; - struct kfd_bo *src_bo = si->cur_bo; - struct dma_fence *lfence = NULL; - - if (!src_bo || !di || !copied) - return -EINVAL; - *copied = 0; - if (f) - *f = NULL; - - while (size && !kfd_cma_iter_end(di)) { - struct dma_fence *fence = NULL; - - copy_size = min(size, (di->array->size - di->offset)); - - err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n); - if (err) { - pr_err("CMA %d failed\n", err); - break; - } - - if (fence) { - err = kfd_fence_put_wait_if_diff_context(fence, - lfence); - lfence = fence; - if (err) - break; - } - - size -= n; - *copied += n; - err = kfd_cma_iter_advance(si, n); - if (err) - break; - err = kfd_cma_iter_advance(di, n); - if (err) - break; - } - - if (f) - *f = dma_fence_get(lfence); - dma_fence_put(lfence); - - return err; -} - static int kfd_ioctl_cross_memory_copy(struct file *filep, struct kfd_process *local_p, void *data) { struct kfd_ioctl_cross_memory_copy_args *args = data; struct kfd_memory_range *src_array, *dst_array; - struct kfd_process *remote_p; + struct kfd_bo *src_bo, *dst_bo; + struct kfd_process *remote_p, *src_p, *dst_p; struct task_struct *remote_task; struct mm_struct *remote_mm; struct pid *remote_pid; - struct dma_fence *lfence = NULL; - uint64_t copied = 0, total_copied = 0; - struct cma_iter di, si; + struct dma_fence *fence = NULL, *lfence = NULL; + uint64_t dst_va_addr; + uint64_t copied, total_copied = 0; + uint64_t src_offset, dst_offset, dst_va_addr_end; const char *cma_op; - int err = 0; + int i, j = 0, err = 0; /* Check parameters */ if (args->src_mem_range_array == 0 || args->dst_mem_range_array == 0 || @@ -2372,76 +1761,169 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep, } remote_p = kfd_get_process(remote_task); - if (IS_ERR(remote_p)) { + if (!remote_p) { pr_err("Cross mem copy failed. Invalid kfd process %d\n", args->pid); err = -EINVAL; goto kfd_process_fail; } - /* Initialise cma_iter si & @di with source & destination range. */ + if (KFD_IS_CROSS_MEMORY_WRITE(args->flags)) { + src_p = local_p; + dst_p = remote_p; cma_op = "WRITE"; pr_debug("CMA WRITE: local -> remote\n"); - err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size, - remote_p, remote_mm, remote_task, &di); - if (err) - goto kfd_process_fail; - err = kfd_cma_iter_init(src_array, args->src_mem_array_size, - local_p, current->mm, current, &si); - if (err) - goto kfd_process_fail; } else { + src_p = remote_p; + dst_p = local_p; cma_op = "READ"; pr_debug("CMA READ: remote -> local\n"); - - err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size, - local_p, current->mm, current, &di); - if (err) - goto kfd_process_fail; - err = kfd_cma_iter_init(src_array, args->src_mem_array_size, - remote_p, remote_mm, remote_task, &si); - if (err) - goto kfd_process_fail; } - /* Copy one si range at a time into di. After each call to - * kfd_copy_single_range() si will move to next range. 
di will be - * incremented by bytes copied - */ - while (!kfd_cma_iter_end(&si) && !kfd_cma_iter_end(&di)) { - struct dma_fence *fence = NULL; - - err = kfd_copy_single_range(&si, &di, - KFD_IS_CROSS_MEMORY_WRITE(args->flags), - &fence, &copied); - total_copied += copied; - if (err) + /* For each source kfd_range: + * - Find the BO. Each range has to be within the same BO. + * - Copy this range to single or multiple destination BOs. + * - dst_va_addr - will point to next va address into which data will + * be copied. + * - dst_bo & src_bo - the current destination and source BOs + * - src_offset & dst_offset - offset into the respective BOs from + * data will be sourced or copied + */ + dst_va_addr = dst_array[0].va_addr; + dst_va_addr_end = dst_va_addr + dst_array[0].size - 1; + mutex_lock(&dst_p->mutex); + dst_bo = kfd_process_find_bo_from_interval(dst_p, + dst_va_addr, + dst_va_addr_end); + mutex_unlock(&dst_p->mutex); + if (!dst_bo || dst_va_addr_end > dst_bo->it.last) { + pr_err("CMA %s failed. Invalid dst range\n", cma_op); + err = -EFAULT; + goto kfd_process_fail; + } + dst_offset = dst_va_addr - dst_bo->it.start; + + for (i = 0; i < args->src_mem_array_size; i++) { + uint64_t src_va_addr_end = src_array[i].va_addr + + src_array[i].size - 1; + uint64_t src_size_to_copy = src_array[i].size; + + mutex_lock(&src_p->mutex); + src_bo = kfd_process_find_bo_from_interval(src_p, + src_array[i].va_addr, + src_va_addr_end); + mutex_unlock(&src_p->mutex); + if (!src_bo || src_va_addr_end > src_bo->it.last) { + pr_err("CMA %s failed. Invalid src range\n", cma_op); + err = -EFAULT; break; + } + + src_offset = src_array[i].va_addr - src_bo->it.start; - /* Release old fence if a later fence is created. If no - * new fence is created, then keep the preivous fence + /* Copy src_bo to one or multiple dst_bo(s) based on size and + * and current copy location. */ - if (fence) { - err = kfd_fence_put_wait_if_diff_context(fence, - lfence); + while (j < args->dst_mem_array_size) { + uint64_t copy_size; + int64_t space_left; + + /* Find the current copy_size. This will be smaller of + * the following + * - space left in the current dest memory range + * - data left to copy from source range + */ + space_left = (dst_array[j].va_addr + dst_array[j].size) + - dst_va_addr; + copy_size = (src_size_to_copy < space_left) ? + src_size_to_copy : space_left; + + /* Check both BOs belong to same device */ + if (src_bo->dev->kgd != dst_bo->dev->kgd) { + pr_err("CMA %s fail. Not same dev\n", cma_op); + err = -EINVAL; + break; + } + + /* Store prev fence. Release it when a later fence is + * created + */ lfence = fence; - if (err) + fence = NULL; + + err = dst_bo->dev->kfd2kgd->copy_mem_to_mem( + src_bo->dev->kgd, + src_bo->mem, src_offset, + dst_bo->mem, dst_offset, + copy_size, + &fence, &copied); + + if (err) { + pr_err("GPU CMA %s failed\n", cma_op); + break; + } + + /* Later fence available. Release old fence */ + if (fence && lfence) { + dma_fence_put(lfence); + lfence = NULL; + } + + total_copied += copied; + src_size_to_copy -= copied; + space_left -= copied; + dst_va_addr += copied; + dst_offset += copied; + src_offset += copied; + if (dst_va_addr > dst_bo->it.last + 1) { + pr_err("CMA %s fail. 
Mem overflow\n", cma_op); + err = -EFAULT; + break; + } + + /* If the cur dest range is full move to next one */ + if (space_left <= 0) { + if (++j >= args->dst_mem_array_size) + break; + + dst_va_addr = dst_array[j].va_addr; + dst_va_addr_end = dst_va_addr + + dst_array[j].size - 1; + dst_bo = kfd_process_find_bo_from_interval( + dst_p, + dst_va_addr, + dst_va_addr_end); + if (!dst_bo || + dst_va_addr_end > dst_bo->it.last) { + pr_err("CMA %s failed. Invalid dst range\n", + cma_op); + err = -EFAULT; + break; + } + dst_offset = dst_va_addr - dst_bo->it.start; + } + + /* If the cur src range is done, move to next one */ + if (src_size_to_copy <= 0) break; } + if (err) + break; } /* Wait for the last fence irrespective of error condition */ - if (lfence) { - err = kfd_cma_fence_wait(lfence); - dma_fence_put(lfence); - if (err) + if (fence) { + if (dma_fence_wait_timeout(fence, false, msecs_to_jiffies(1000)) + < 0) pr_err("CMA %s failed. BO timed out\n", cma_op); + dma_fence_put(fence); + } else if (lfence) { + pr_debug("GPU copy fail. But wait for prev DMA to finish\n"); + dma_fence_wait_timeout(lfence, true, msecs_to_jiffies(1000)); + dma_fence_put(lfence); } - kfd_free_cma_bos(&si); - kfd_free_cma_bos(&di); - kfd_process_fail: mmput(remote_mm); mm_access_fail: @@ -2530,21 +2012,6 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, kfd_ioctl_dbg_wave_control, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, - kfd_ioctl_set_scratch_backing_va, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, - kfd_ioctl_get_tile_config, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, - kfd_ioctl_set_trap_handler, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, - kfd_ioctl_get_process_apertures_new, 0), - - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM, - kfd_ioctl_acquire_vm, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_GPU, kfd_ioctl_alloc_memory_of_gpu, 0), @@ -2557,15 +2024,30 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU, kfd_ioctl_unmap_memory_from_gpu, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_MEMORY_OF_SCRATCH, + kfd_ioctl_alloc_scratch_memory, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_CU_MASK, kfd_ioctl_set_cu_mask, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_PROCESS_DGPU_APERTURE, + kfd_ioctl_set_process_dgpu_aperture, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW, + kfd_ioctl_get_process_apertures_new, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_DMABUF_INFO, kfd_ioctl_get_dmabuf_info, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_IMPORT_DMABUF, kfd_ioctl_import_dmabuf, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, + kfd_ioctl_get_tile_config, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_IPC_IMPORT_HANDLE, kfd_ioctl_ipc_import_handle, 0), @@ -2578,6 +2060,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_QUEUE_WAVE_STATE, kfd_ioctl_get_queue_wave_state, 0), + AMDKFD_IOCTL_DEF(AMDKFD_IOC_ACQUIRE_VM, + kfd_ioctl_acquire_vm, 0) + }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -2673,33 +2158,34 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) { struct kfd_process *process; - struct kfd_dev *dev = NULL; + struct kfd_dev *kfd; unsigned long vm_pgoff; - unsigned int gpu_id; + unsigned long long mmap_type; process = 
kfd_get_process(current); if (IS_ERR(process)) return PTR_ERR(process); vm_pgoff = vma->vm_pgoff; - vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vm_pgoff); - gpu_id = KFD_MMAP_GPU_ID_GET(vm_pgoff); - if (gpu_id) - dev = kfd_device_by_id(gpu_id); + vma->vm_pgoff = KFD_MMAP_OFFSET_VALUE_GET(vma->vm_pgoff); + mmap_type = vm_pgoff & KFD_MMAP_TYPE_MASK; - switch (vm_pgoff & KFD_MMAP_TYPE_MASK) { + switch (mmap_type) { case KFD_MMAP_TYPE_DOORBELL: - if (!dev) - return -ENODEV; - return kfd_doorbell_mmap(dev, process, vma); + kfd = kfd_device_by_id(KFD_MMAP_GPU_ID_GET(vm_pgoff)); + if (!kfd) + return -EFAULT; + return kfd_doorbell_mmap(kfd, process, vma); case KFD_MMAP_TYPE_EVENTS: return kfd_event_mmap(process, vma); case KFD_MMAP_TYPE_RESERVED_MEM: - if (!dev) - return -ENODEV; - return kfd_reserved_mem_mmap(dev, process, vma); + return kfd_reserved_mem_mmap(process, vma); + + default: + pr_err("Unsupported kfd mmap type %llx\n", mmap_type); + break; } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index c540b65..24d0634 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1,27 +1,7 @@ -/* - * Copyright 2015-2017 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include +#include #include +#include +#include #include "kfd_crat.h" #include "kfd_priv.h" #include "kfd_topology.h" @@ -286,7 +266,6 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, id = cache->processor_id_low; - pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); list_for_each_entry(dev, device_list, list) { total_num_of_cu = (dev->node_props.array_count * dev->node_props.cu_per_simd_array); @@ -436,15 +415,11 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, ret = kfd_parse_subtype_cache(cache, device_list); break; case CRAT_SUBTYPE_TLB_AFFINITY: - /* - * For now, nothing to do here - */ + /* For now, nothing to do here */ pr_debug("Found TLB entry in CRAT table (not processing)\n"); break; case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* - * For now, nothing to do here - */ + /* For now, nothing to do here */ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n"); break; case CRAT_SUBTYPE_IOLINK_AFFINITY: @@ -469,8 +444,9 @@ static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, * * Return - 0 if successful else -ve value */ -int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, - uint32_t proximity_domain) +int kfd_parse_crat_table(void *crat_image, + struct list_head *device_list, + uint32_t proximity_domain) { struct kfd_topology_device *top_dev = NULL; struct crat_subtype_generic *sub_type_hdr; @@ -642,7 +618,6 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); break; case CHIP_VEGA10: - case CHIP_VEGA20: pcache_info = vega10_cache_info; num_of_cache_types = ARRAY_SIZE(vega10_cache_info); break; @@ -718,7 +693,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, * crat_image will be NULL * @size: [OUT] size of crat_image * - * Return 0 if successful else return error code + * Return 0 if successful else return -ve value */ #ifdef CONFIG_ACPI int kfd_create_crat_image_acpi(void **crat_image, size_t *size) @@ -750,8 +725,10 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size) } pcrat_image = kmalloc(crat_table->length, GFP_KERNEL); - if (!pcrat_image) + if (!pcrat_image) { + pr_err("No memory for allocating CRAT image\n"); return -ENOMEM; + } memcpy(pcrat_image, crat_table, crat_table->length); @@ -938,7 +915,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) #ifdef CONFIG_ACPI status = acpi_get_table("DSDT", 0, &acpi_table); - if (status != AE_OK) + if (status == AE_NOT_FOUND) pr_warn("DSDT table not found for OEM information\n"); else { crat_table->oem_revision = acpi_table->revision; @@ -1095,8 +1072,8 @@ static int kfd_fill_gpu_direct_io_link(int *avail_size, * [OUT] actual size of data filled in crat_image */ static int kfd_create_vcrat_image_gpu(void *pcrat_image, - size_t *size, struct kfd_dev *kdev, - uint32_t proximity_domain) + size_t *size, struct kfd_dev *kdev, + uint32_t proximity_domain) { struct crat_header *crat_table = (struct crat_header *)pcrat_image; struct crat_subtype_generic *sub_type_hdr; @@ -1264,8 +1241,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, * Return 0 if successful else return -ve value */ int kfd_create_crat_image_virtual(void **crat_image, size_t *size, - int flags, struct kfd_dev *kdev, - uint32_t proximity_domain) + int flags, struct kfd_dev *kdev, uint32_t proximity_domain) { void *pcrat_image = NULL; int ret = 0; @@ -1295,8 +1271,8 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size, 
if (!pcrat_image) return -ENOMEM; *size = VCRAT_SIZE_FOR_GPU; - ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, - proximity_domain); + ret = kfd_create_vcrat_image_gpu(pcrat_image, size, + kdev, proximity_domain); break; case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): /* TODO: */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index cd7ee6d..00de41f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -24,6 +24,7 @@ #define KFD_CRAT_H_INCLUDED #include +#include "kfd_priv.h" #pragma pack(1) @@ -227,12 +228,12 @@ struct crat_subtype_ccompute { /* * HSA IO Link Affinity structure and definitions */ -#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) -#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) -#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) -#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) -#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) -#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 /* * IO interface types @@ -240,18 +241,18 @@ struct crat_subtype_ccompute { #define CRAT_IOLINK_TYPE_UNDEFINED 0 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 -#define CRAT_IOLINK_TYPE_AMBA 3 -#define CRAT_IOLINK_TYPE_MIPI 4 -#define CRAT_IOLINK_TYPE_QPI_1_1 5 -#define CRAT_IOLINK_TYPE_RESERVED1 6 -#define CRAT_IOLINK_TYPE_RESERVED2 7 -#define CRAT_IOLINK_TYPE_RAPID_IO 8 -#define CRAT_IOLINK_TYPE_INFINIBAND 9 -#define CRAT_IOLINK_TYPE_RESERVED3 10 -#define CRAT_IOLINK_TYPE_OTHER 11 -#define CRAT_IOLINK_TYPE_MAX 255 - -#define CRAT_IOLINK_RESERVED_LENGTH 24 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 +#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 +#define CRAT_IOLINK_TYPE_MAX 255 + +#define CRAT_IOLINK_RESERVED_LENGTH 24 struct crat_subtype_iolink { uint8_t type; @@ -307,16 +308,13 @@ struct cdit_header { #pragma pack() -struct kfd_dev; - #ifdef CONFIG_ACPI int kfd_create_crat_image_acpi(void **crat_image, size_t *size); #endif void kfd_destroy_crat_image(void *crat_image); -int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, - uint32_t proximity_domain); +int kfd_parse_crat_table(void *crat_image, + struct list_head *device_list, + uint32_t proximity_domain); int kfd_create_crat_image_virtual(void **crat_image, size_t *size, - int flags, struct kfd_dev *kdev, - uint32_t proximity_domain); - + int flags, struct kfd_dev *kdev, uint32_t proximity_domain); #endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c index ab37d36..232e28f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2017 Advanced Micro Devices, Inc. + * Copyright 2014 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -21,8 +21,6 @@ */ #include -#include - #include "kfd_priv.h" static struct dentry *debugfs_root; @@ -34,38 +32,6 @@ static int kfd_debugfs_open(struct inode *inode, struct file *file) return single_open(file, show, NULL); } -static ssize_t kfd_debugfs_hang_hws_write(struct file *file, - const char __user *user_buf, size_t size, loff_t *ppos) -{ - struct kfd_dev *dev; - char tmp[16]; - uint32_t gpu_id; - int ret = -EINVAL; - - memset(tmp, 0, 16); - if (size >= 16) { - pr_err("Invalid input for gpu id.\n"); - goto out; - } - if (copy_from_user(tmp, user_buf, size)) { - ret = -EFAULT; - goto out; - } - if (kstrtoint(tmp, 10, &gpu_id)) { - pr_err("Invalid input for gpu id.\n"); - goto out; - } - dev = kfd_device_by_id(gpu_id); - if (dev) { - kfd_debugfs_hang_hws(dev); - ret = size; - } else - pr_err("Cannot find device %d.\n", gpu_id); - -out: - return ret; -} - static const struct file_operations kfd_debugfs_fops = { .owner = THIS_MODULE, .open = kfd_debugfs_open, @@ -74,15 +40,6 @@ static const struct file_operations kfd_debugfs_fops = { .release = single_release, }; -static const struct file_operations kfd_debugfs_hang_hws_fops = { - .owner = THIS_MODULE, - .open = kfd_debugfs_open, - .read = seq_read, - .write = kfd_debugfs_hang_hws_write, - .llseek = seq_lseek, - .release = single_release, -}; - void kfd_debugfs_init(void) { struct dentry *ent; @@ -108,11 +65,6 @@ void kfd_debugfs_init(void) ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, kfd_debugfs_rls_by_device, &kfd_debugfs_fops); - - ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, - NULL, - &kfd_debugfs_hang_hws_fops); - if (!ent) pr_warn("Failed to create rls in kfd debugfs\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c old mode 100644 new mode 100755 index 10095087..a9ad2a8 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -27,17 +27,12 @@ #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" -#include "cwsr_trap_handler.h" +#include "cwsr_trap_handler_gfx8.asm" +#include "cwsr_trap_handler_gfx9.asm" #include "kfd_iommu.h" #define MQD_SIZE_ALIGNED 768 - -/* - * kfd_locked is used to lock the kfd driver during suspend or reset - * once locked, kfd driver will stop any further GPU execution. - * create process (open) will return -EAGAIN. 
- */ -static atomic_t kfd_locked = ATOMIC_INIT(0); +static atomic_t kfd_device_suspended = ATOMIC_INIT(0); #ifdef KFD_SUPPORT_IOMMU_V2 static const struct kfd_device_info kaveri_device_info = { @@ -54,7 +49,6 @@ static const struct kfd_device_info kaveri_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info carrizo_device_info = { @@ -71,7 +65,6 @@ static const struct kfd_device_info carrizo_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info raven_device_info = { @@ -87,7 +80,6 @@ static const struct kfd_device_info raven_device_info = { .needs_iommu_device = true, .needs_pci_atomics = true, .num_sdma_engines = 1, - .num_sdma_queues_per_engine = 2, }; #endif @@ -105,7 +97,6 @@ static const struct kfd_device_info hawaii_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info tonga_device_info = { @@ -121,7 +112,6 @@ static const struct kfd_device_info tonga_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info tonga_vf_device_info = { @@ -137,7 +127,6 @@ static const struct kfd_device_info tonga_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info fiji_device_info = { @@ -153,7 +142,6 @@ static const struct kfd_device_info fiji_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info fiji_vf_device_info = { @@ -169,7 +157,6 @@ static const struct kfd_device_info fiji_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; @@ -186,7 +173,6 @@ static const struct kfd_device_info polaris10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris10_vf_device_info = { @@ -202,7 +188,6 @@ static const struct kfd_device_info polaris10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris11_device_info = { @@ -218,7 +203,6 @@ static const struct kfd_device_info polaris11_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega10_device_info = { @@ -232,9 +216,8 @@ static const struct kfd_device_info vega10_device_info = { .mqd_size_aligned = MQD_SIZE_ALIGNED, .supports_cwsr = true, .needs_iommu_device = false, - .needs_pci_atomics = false, + .needs_pci_atomics = true, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega10_vf_device_info = { @@ -250,23 +233,6 @@ static const struct kfd_device_info vega10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 2, -}; - -static const struct kfd_device_info vega20_device_info = { - .asic_family = CHIP_VEGA20, - 
.max_pasid_bits = 16, - .max_no_of_hqd = 24, - .doorbell_size = 8, - .ih_ring_entry_size = 8 * sizeof(uint32_t), - .event_interrupt_class = &event_interrupt_class_v9, - .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED, - .supports_cwsr = true, - .needs_iommu_device = false, - .needs_pci_atomics = true, - .num_sdma_engines = 2, - .num_sdma_queues_per_engine = 8, }; struct kfd_deviceid { @@ -317,35 +283,35 @@ static const struct kfd_deviceid supported_devices[] = { { 0x67B9, &hawaii_device_info }, /* Hawaii */ { 0x67BA, &hawaii_device_info }, /* Hawaii */ { 0x67BE, &hawaii_device_info }, /* Hawaii */ - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ + { 0x6920, &tonga_device_info }, /* Tonga */ + { 0x6921, &tonga_device_info }, /* Tonga */ + { 0x6928, &tonga_device_info }, /* Tonga */ + { 0x6929, &tonga_device_info }, /* Tonga */ + { 0x692B, &tonga_device_info }, /* Tonga */ + { 0x692F, &tonga_vf_device_info }, /* Tonga vf */ + { 0x6938, &tonga_device_info }, /* Tonga */ + { 0x6939, &tonga_device_info }, /* Tonga */ + { 0x7300, &fiji_device_info }, /* Fiji */ + { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ + { 0x67C0, &polaris10_device_info }, /* Polaris10 */ + { 0x67C1, &polaris10_device_info }, /* Polaris10 */ + { 0x67C2, &polaris10_device_info }, /* Polaris10 */ { 0x67C4, &polaris10_device_info }, /* Polaris10 */ { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ + { 0x67C8, &polaris10_device_info }, /* Polaris10 */ + { 0x67C9, &polaris10_device_info }, /* Polaris10 */ + { 0x67CA, &polaris10_device_info }, /* Polaris10 */ + { 0x67CC, &polaris10_device_info }, /* Polaris10 */ + { 0x67CF, &polaris10_device_info }, /* Polaris10 */ + { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ + { 0x67E0, &polaris11_device_info }, /* Polaris11 */ + { 0x67E1, &polaris11_device_info }, /* Polaris11 */ { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ + { 0x67E7, &polaris11_device_info }, /* Polaris11 */ + { 0x67E8, &polaris11_device_info }, /* Polaris11 */ + { 0x67E9, &polaris11_device_info }, /* Polaris11 */ + { 0x67EB, &polaris11_device_info }, /* Polaris11 */ { 0x67EF, &polaris11_device_info }, /* Polaris11 */ { 0x67FF, &polaris11_device_info }, /* Polaris11 */ { 0x6860, &vega10_device_info }, /* Vega10 */ @@ -357,12 
+323,6 @@ static const struct kfd_deviceid supported_devices[] = { { 0x6868, &vega10_device_info }, /* Vega10 */ { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ { 0x687F, &vega10_device_info }, /* Vega10 */ - { 0x66a0, &vega20_device_info }, /* Vega20 */ - { 0x66a1, &vega20_device_info }, /* Vega20 */ - { 0x66a2, &vega20_device_info }, /* Vega20 */ - { 0x66a3, &vega20_device_info }, /* Vega20 */ - { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info } /* Vega20 */ }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -392,7 +352,7 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) { struct kfd_dev *kfd; - int ret; + const struct kfd_device_info *device_info = lookup_device_info(pdev->device); @@ -400,27 +360,24 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, dev_err(kfd_device, "kgd2kfd_probe failed\n"); return NULL; } - + + if (device_info->needs_pci_atomics) { + /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. + * 32 and 64-bit requests are possible and must be + * supported. + */ + if (pci_enable_atomic_ops_to_root(pdev) < 0) { + dev_info(kfd_device, + "skipped device %x:%x, PCI rejects atomics", + pdev->vendor, pdev->device); + return NULL; + } + } + kfd = kzalloc(sizeof(*kfd), GFP_KERNEL); if (!kfd) return NULL; - /* Allow BIF to recode atomics to PCIe 3.0 AtomicOps. - * 32 and 64-bit requests are possible and must be - * supported. - */ - ret = pci_enable_atomic_ops_to_root(pdev, - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64); - if (device_info->needs_pci_atomics && ret < 0) { - dev_info(kfd_device, - "skipped device %x:%x, PCI rejects atomics", - pdev->vendor, pdev->device); - kfree(kfd); - return NULL; - } else if (!ret) - kfd->pci_atomic_requested = true; - kfd->kgd = kgd; kfd->device_info = device_info; kfd->pdev = pdev; @@ -462,6 +419,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, KGD_ENGINE_SDMA1); kfd->shared_resources = *gpu_resources; + /* Usually first_vmid_kfd = 8, last_vmid_kfd = 15 */ kfd->vm_info.first_vmid_kfd = ffs(gpu_resources->compute_vmid_bitmap)-1; kfd->vm_info.last_vmid_kfd = fls(gpu_resources->compute_vmid_bitmap)-1; kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd @@ -498,8 +456,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd->kfd2kgd->init_gtt_mem_allocation( kfd->kgd, size, &kfd->gtt_mem, - &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, - false)) { + &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)){ dev_err(kfd_device, "Could not allocate %d bytes\n", size); goto out; } @@ -592,52 +549,21 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) int kgd2kfd_pre_reset(struct kfd_dev *kfd) { - if (!kfd->init_complete) - return 0; - kgd2kfd_suspend(kfd); - - /* hold dqm->lock to prevent further execution*/ - mutex_lock(&kfd->dqm->lock); - - kfd_signal_reset_event(kfd); return 0; } -/* - * Fix me. KFD won't be able to resume existing process for now. - * We will keep all existing process in a evicted state and - * wait the process to be terminated. - */ - int kgd2kfd_post_reset(struct kfd_dev *kfd) { - int ret, count; - - if (!kfd->init_complete) - return 0; - - mutex_unlock(&kfd->dqm->lock); - - ret = kfd_resume(kfd); - if (ret) - return ret; - count = atomic_dec_return(&kfd_locked); - WARN_ONCE(count != 0, "KFD reset ref. 
error"); return 0; } -bool kfd_is_locked(void) -{ - return (atomic_read(&kfd_locked) > 0); -} - void kgd2kfd_suspend(struct kfd_dev *kfd) { if (!kfd->init_complete) return; /* For first KFD device suspend all the KFD processes */ - if (atomic_inc_return(&kfd_locked) == 1) + if (atomic_inc_return(&kfd_device_suspended) == 1) kfd_suspend_all_processes(); kfd->dqm->ops.stop(kfd->dqm); @@ -656,7 +582,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd) if (ret) return ret; - count = atomic_dec_return(&kfd_locked); + count = atomic_dec_return(&kfd_device_suspended); WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); if (count == 0) ret = kfd_resume_all_processes(); @@ -704,19 +630,19 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) spin_lock(&kfd->interrupt_lock); - if (kfd->interrupts_active - && interrupt_is_wanted(kfd, ih_ring_entry, - patched_ihre, &is_patched) + if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, + patched_ihre, &is_patched) && enqueue_ih_ring_entry(kfd, - is_patched ? patched_ihre : ih_ring_entry)) + is_patched ? patched_ihre : ih_ring_entry)) queue_work(kfd->ih_wq, &kfd->interrupt_work); spin_unlock(&kfd->interrupt_lock); } -int kgd2kfd_quiesce_mm(struct mm_struct *mm) +int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm) { struct kfd_process *p; + struct kfd_process_device *pdd; int r; /* Because we are called from arbitrary context (workqueue) as opposed @@ -725,17 +651,26 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) */ p = kfd_lookup_process_by_mm(mm); if (!p) - return -ESRCH; + return -ENODEV; - r = kfd_process_evict_queues(p); + if (kfd) { + r = -ENODEV; + pdd = kfd_get_process_device_data(kfd, p); + if (pdd) + r = kfd->dqm->ops.evict_process_queues(kfd->dqm, + &pdd->qpd); + } else { + r = kfd_process_evict_queues(p); + } kfd_unref_process(p); return r; } -int kgd2kfd_resume_mm(struct mm_struct *mm) +int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm) { struct kfd_process *p; + struct kfd_process_device *pdd; int r; /* Because we are called from arbitrary context (workqueue) as opposed @@ -744,9 +679,17 @@ int kgd2kfd_resume_mm(struct mm_struct *mm) */ p = kfd_lookup_process_by_mm(mm); if (!p) - return -ESRCH; + return -ENODEV; - r = kfd_process_restore_queues(p); + if (kfd) { + r = -ENODEV; + pdd = kfd_get_process_device_data(kfd, p); + if (pdd) + r = kfd->dqm->ops.restore_process_queues(kfd->dqm, + &pdd->qpd); + } else { + r = kfd_process_restore_queues(p); + } kfd_unref_process(p); return r; @@ -981,26 +924,3 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) kfree(mem_obj); return 0; } - -#if defined(CONFIG_DEBUG_FS) - -/* This function will send a package to HIQ to hang the HWS - * which will trigger a GPU reset and bring the HWS back to normal state - */ -int kfd_debugfs_hang_hws(struct kfd_dev *dev) -{ - int r = 0; - - if (dev->dqm->sched_policy != KFD_SCHED_POLICY_HWS) { - pr_err("HWS is not enabled"); - return -EINVAL; - } - - r = pm_debugfs_hang_hws(&dev->dqm->packets); - if (!r) - r = dqm_debugfs_execute_queues(dev->dqm); - - return r; -} - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ae6f7d8..8c04f7a2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -21,11 +21,10 @@ * */ -#include -#include #include #include #include +#include #include #include #include "kfd_priv.h" @@ -61,8 +60,6 @@ static int 
create_sdma_queue_nocpsch(struct device_queue_manager *dqm, static void deallocate_sdma_queue(struct device_queue_manager *dqm, unsigned int sdma_queue_id); -static void kfd_process_hw_exception(struct work_struct *work); - static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { @@ -109,7 +106,7 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { return dqm->dev->device_info->num_sdma_engines - * dqm->dev->device_info->num_sdma_queues_per_engine; + * KFD_SDMA_QUEUES_PER_ENGINE; } void program_sh_mem_settings(struct device_queue_manager *dqm, @@ -200,7 +197,7 @@ static int allocate_vmid(struct device_queue_manager *dqm, dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->kgd, qpd->vmid, qpd->page_table_base); - /* invalidate the VM context after pasid and vmid mapping is set up */ + /*invalidate the VM context after pasid and vmid mapping is set up*/ kfd_flush_tlb(qpd_to_pdd(qpd)); return 0; @@ -209,19 +206,16 @@ static int allocate_vmid(struct device_queue_manager *dqm, static int flush_texture_cache_nocpsch(struct kfd_dev *kdev, struct qcm_process_device *qpd) { - const struct packet_manager_funcs *pmf = qpd->dqm->packets.pmf; - int ret; + uint32_t len; if (!qpd->ib_kaddr) return -ENOMEM; - ret = pmf->release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr); - if (ret) - return ret; + len = qpd->dqm->packets.pmf->release_mem(qpd->ib_base, + (uint32_t *)qpd->ib_kaddr); return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid, - qpd->ib_base, (uint32_t *)qpd->ib_kaddr, - pmf->release_mem_size / sizeof(uint32_t)); + qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len); } static void deallocate_vmid(struct device_queue_manager *dqm, @@ -290,6 +284,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (retval) { if (list_empty(&qpd->queues_list)) deallocate_vmid(dqm, qpd, q); + goto out_unlock; } @@ -359,10 +354,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { int retval; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (!mqd) return -ENOMEM; retval = allocate_hqd(dqm, q); @@ -373,7 +368,7 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, if (retval) goto out_deallocate_hqd; - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) goto out_deallocate_doorbell; @@ -387,15 +382,15 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, if (!q->properties.is_active) return 0; - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, - &q->properties, q->process->mm); + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, + q->process->mm); if (retval) goto out_uninit_mqd; return 0; out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_hqd: @@ -412,11 +407,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, struct queue *q) { int retval; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, 
get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) + if (!mqd) return -ENOMEM; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { @@ -433,14 +428,14 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, deallocate_doorbell(qpd, q); - retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + retval = mqd->destroy_mqd(mqd, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); if (retval == -ETIME) qpd->reset_wavefronts = true; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); list_del(&q->list); if (list_empty(&qpd->queues_list)) { @@ -480,19 +475,21 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, static int update_queue(struct device_queue_manager *dqm, struct queue *q) { int retval; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; struct kfd_process_device *pdd; + bool prev_active = false; mutex_lock(&dqm->lock); + pdd = kfd_get_process_device_data(q->device, q->process); if (!pdd) { retval = -ENODEV; goto out_unlock; } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { + if (!mqd) { retval = -ENOMEM; goto out_unlock; } @@ -500,7 +497,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) * Eviction state logic: we only mark active queues as evicted * to avoid the overhead of restoring inactive queues later */ - if (pdd->qpd.evicted) + if (pdd->qpd.evicted > 0) q->properties.is_evicted = (q->properties.queue_size > 0 && q->properties.queue_percent > 0 && q->properties.queue_address != 0); @@ -519,7 +516,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA)) { - retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + retval = mqd->destroy_mqd(mqd, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); if (retval) { @@ -528,7 +525,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } - retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); + retval = mqd->update_mqd(mqd, q->mqd, &q->properties); /* * check active state vs. 
the previous state and modify @@ -546,7 +543,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || q->properties.type == KFD_QUEUE_TYPE_SDMA)) - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, q->process->mm); out_unlock: @@ -557,29 +554,29 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) static struct mqd_manager *get_mqd_manager( struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) { - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) return NULL; pr_debug("mqd type %d\n", type); - mqd_mgr = dqm->mqd_mgrs[type]; - if (!mqd_mgr) { - mqd_mgr = mqd_manager_init(type, dqm->dev); - if (!mqd_mgr) + mqd = dqm->mqds[type]; + if (!mqd) { + mqd = mqd_manager_init(type, dqm->dev); + if (!mqd) pr_err("mqd manager is NULL"); - dqm->mqd_mgrs[type] = mqd_mgr; + dqm->mqds[type] = mqd; } - return mqd_mgr; + return mqd; } static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct queue *q; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; struct kfd_process_device *pdd; int retval = 0; @@ -595,16 +592,16 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, list_for_each_entry(q, &qpd->queues_list, list) { if (!q->properties.is_active) continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ + if (!mqd) { /* should not be here */ pr_err("Cannot evict queue, mqd mgr is NULL\n"); retval = -ENOMEM; goto out; } q->properties.is_evicted = true; q->properties.is_active = false; - retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, + retval = mqd->destroy_mqd(mqd, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); if (retval) @@ -654,9 +651,9 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct queue *q; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; struct kfd_process_device *pdd; - uint64_t pd_base; + uint32_t pd_base; int retval = 0; pdd = qpd_to_pdd(qpd); @@ -676,7 +673,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%llx\n", pd_base); + pr_debug("Updated PD address to 0x%08x\n", pd_base); if (!list_empty(&qpd->queues_list)) { dqm->dev->kfd2kgd->set_vm_context_page_table_base( @@ -690,16 +687,16 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, list_for_each_entry(q, &qpd->queues_list, list) { if (!q->properties.is_evicted) continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ + if (!mqd) { /* should not be here */ pr_err("Cannot restore queue, mqd mgr is NULL\n"); retval = -ENOMEM; goto out; } q->properties.is_evicted = false; q->properties.is_active = true; - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, + retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties, q->process->mm); if (retval) @@ -717,7 +714,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, { struct queue *q; struct 
kfd_process_device *pdd; - uint64_t pd_base; + uint32_t pd_base; int retval = 0; pdd = qpd_to_pdd(qpd); @@ -737,7 +734,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%llx\n", pd_base); + pr_debug("Updated PD address to 0x%08x\n", pd_base); /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { @@ -760,9 +757,9 @@ static int register_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct device_process_node *n; - struct kfd_process_device *pdd; - uint64_t pd_base; int retval; + struct kfd_process_device *pdd; + uint32_t pd_base; n = kzalloc(sizeof(*n), GFP_KERNEL); if (!n) @@ -779,7 +776,7 @@ static int register_process(struct device_queue_manager *dqm, /* Update PD Base in QPD */ qpd->page_table_base = pd_base; - pr_debug("Updated PD address to 0x%llx\n", pd_base); + pr_debug("Updated PD address to 0x%08x\n", pd_base); retval = dqm->asic_ops.update_qpd(dqm, qpd); @@ -880,7 +877,7 @@ static void uninitialize(struct device_queue_manager *dqm) kfree(dqm->allocated_queues); for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++) - kfree(dqm->mqd_mgrs[i]); + kfree(dqm->mqds[i]); mutex_destroy(&dqm->lock); kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); } @@ -888,7 +885,7 @@ static void uninitialize(struct device_queue_manager *dqm) static int start_nocpsch(struct device_queue_manager *dqm) { init_interrupts(dqm); - return pm_init(&dqm->packets, dqm); + return pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); } static int stop_nocpsch(struct device_queue_manager *dqm) @@ -924,11 +921,11 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; int retval; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); - if (!mqd_mgr) + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); + if (!mqd) return -ENOMEM; retval = allocate_sdma_queue(dqm, &q->sdma_id); @@ -947,20 +944,19 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) goto out_deallocate_doorbell; - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, - NULL); + retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL); if (retval) goto out_uninit_mqd; return 0; out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: @@ -1025,8 +1021,6 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->active_runlist = false; dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; - INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); - return 0; } @@ -1036,7 +1030,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; - retval = pm_init(&dqm->packets, dqm); + retval = pm_init(&dqm->packets, dqm, dqm->dev->mec_fw_version); if (retval) goto fail_packet_manager_init; @@ -1059,8 +1053,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); mutex_lock(&dqm->lock); - /* clear hang status when driver try to start the hw 
scheduler */ - dqm->is_hws_hang = false; execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); mutex_unlock(&dqm->lock); @@ -1075,7 +1067,9 @@ static int start_cpsch(struct device_queue_manager *dqm) static int stop_cpsch(struct device_queue_manager *dqm) { mutex_lock(&dqm->lock); + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + mutex_unlock(&dqm->lock); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1136,7 +1130,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { int retval; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; retval = 0; @@ -1163,10 +1157,10 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (retval) goto out_deallocate_sdma_queue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { + if (!mqd) { retval = -ENOMEM; goto out_deallocate_doorbell; } @@ -1183,7 +1177,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, q->properties.tba_addr = qpd->tba_addr; q->properties.tma_addr = qpd->tma_addr; - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) goto out_deallocate_doorbell; @@ -1230,13 +1224,6 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, while (*fence_addr != fence_value) { if (time_after(jiffies, end_jiffies)) { pr_err("qcm fence wait loop timeout expired\n"); - /* In HWS case, this is used to halt the driver thread - * in order not to mess up CP states before doing - * scandumps for FW debugging. - */ - while (halt_if_hws_hang) - schedule(); - return -ETIME; } schedule(); @@ -1281,8 +1268,6 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { int retval = 0; - if (dqm->is_hws_hang) - return -EIO; if (!dqm->active_runlist) return retval; @@ -1321,13 +1306,9 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; - if (dqm->is_hws_hang) - return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param); if (retval) { pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); - dqm->is_hws_hang = true; - schedule_work(&dqm->hw_exception_work); return retval; } @@ -1339,7 +1320,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, struct queue *q) { int retval; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; bool preempt_all_queues; preempt_all_queues = false; @@ -1359,9 +1340,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { + if (!mqd) { retval = -ENOMEM; goto failed; } @@ -1382,7 +1363,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); /* * Unconditionally decrement this counter, regardless of the queue's @@ -1531,7 +1512,7 @@ static int get_wave_state(struct device_queue_manager *dqm, u32 *ctl_stack_used_size, u32 *save_area_used_size) { - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; int r; mutex_lock(&dqm->lock); @@ -1542,19 +1523,19 @@ static int get_wave_state(struct device_queue_manager *dqm, goto dqm_unlock; } - 
mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) { + mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); + if (!mqd) { r = -ENOMEM; goto dqm_unlock; } - if (!mqd_mgr->get_wave_state) { + if (!mqd->get_wave_state) { r = -EINVAL; goto dqm_unlock; } - r = mqd_mgr->get_wave_state(mqd_mgr, q->mqd, ctl_stack, - ctl_stack_used_size, save_area_used_size); + r = mqd->get_wave_state(mqd, q->mqd, ctl_stack, ctl_stack_used_size, + save_area_used_size); dqm_unlock: mutex_unlock(&dqm->lock); @@ -1567,7 +1548,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, int retval; struct queue *q, *next; struct kernel_queue *kq, *kq_next; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; struct device_process_node *cur, *next_dpn; enum kfd_unmap_queues_filter filter = KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; @@ -1609,7 +1590,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, } retval = execute_queues_cpsch(dqm, filter, 0); - if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + if (retval || qpd->reset_wavefronts) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; @@ -1617,15 +1598,15 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, /* lastly, free mqd resources */ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd_mgr = dqm->ops.get_mqd_manager(dqm, + mqd = dqm->ops.get_mqd_manager(dqm, get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { + if (!mqd) { retval = -ENOMEM; goto out; } list_del(&q->list); qpd->queue_count--; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); } out: @@ -1644,13 +1625,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) return NULL; switch (dev->device_info->asic_family) { - /* HWS is not available on Hawaii. */ case CHIP_HAWAII: - /* HWS depends on CWSR for timely dequeue. CWSR is not - * available on Tonga. - * - * FIXME: This argument also applies to Kaveri. 
- */ case CHIP_TONGA: dqm->sched_policy = KFD_SCHED_POLICY_NO_HWS; break; @@ -1729,9 +1704,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) break; case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: - device_queue_manager_init_v9(&dqm->asic_ops); + device_queue_manager_init_v9_vega10(&dqm->asic_ops); break; default: WARN(1, "Unexpected ASIC family %u", @@ -1770,13 +1744,6 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm, return ret; } -static void kfd_process_hw_exception(struct work_struct *work) -{ - struct device_queue_manager *dqm = container_of(work, - struct device_queue_manager, hw_exception_work); - dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd); -} - #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, @@ -1841,9 +1808,7 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) } for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { - for (queue = 0; - queue < dqm->dev->device_info->num_sdma_queues_per_engine; - queue++) { + for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) { r = dqm->dev->kfd2kgd->hqd_sdma_dump( dqm->dev->kgd, pipe, queue, &dump, &n_regs); if (r) @@ -1860,16 +1825,4 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) return r; } -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) -{ - int r = 0; - - mutex_lock(&dqm->lock); - dqm->active_runlist = true; - r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); - mutex_unlock(&dqm->lock); - - return r; -} - #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 1c4ef00..978458a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -31,6 +31,7 @@ #define KFD_UNMAP_LATENCY_MS (4000) #define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) +#define KFD_SDMA_QUEUES_PER_ENGINE (2) struct device_process_node { struct qcm_process_device *qpd; @@ -174,7 +175,7 @@ struct device_queue_manager { struct device_queue_manager_ops ops; struct device_queue_manager_asic_ops asic_ops; - struct mqd_manager *mqd_mgrs[KFD_MQD_TYPE_MAX]; + struct mqd_manager *mqds[KFD_MQD_TYPE_MAX]; struct packet_manager packets; struct kfd_dev *dev; struct mutex lock; @@ -194,10 +195,6 @@ struct device_queue_manager { struct kfd_mem_obj *fence_mem; bool active_runlist; int sched_policy; - - /* hw exception */ - bool is_hws_hang; - struct work_struct hw_exception_work; }; void device_queue_manager_init_cik( @@ -208,7 +205,7 @@ void device_queue_manager_init_vi( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_vi_tonga( struct device_queue_manager_asic_ops *asic_ops); -void device_queue_manager_init_v9( +void device_queue_manager_init_v9_vega10( struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); @@ -217,11 +214,18 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); +int process_evict_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +int process_restore_queues(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); + + static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { return (pdd->lds_base >> 16) & 0xFF; } +/* This function is 
only useful for GFXv7 and v8 */ static inline unsigned int get_sh_mem_bases_nybble_64(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 4175153..6198bf2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -32,7 +32,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm, static void init_sdma_vm_v9(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_v9( +void device_queue_manager_init_v9_vega10( struct device_queue_manager_asic_ops *asic_ops) { asic_ops->update_qpd = update_qpd_v9; @@ -60,7 +60,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm, qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; - if (noretry && + if (vega10_noretry && !dqm->dev->device_info->needs_iommu_device) qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index fd60a11..030b014 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -33,30 +33,26 @@ static bool set_cache_memory_policy_vi(struct device_queue_manager *dqm, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); +static int update_qpd_vi(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +/* + * Tonga device queue manager functions + */ static bool set_cache_memory_policy_vi_tonga(struct device_queue_manager *dqm, struct qcm_process_device *qpd, enum cache_policy default_policy, enum cache_policy alternate_policy, void __user *alternate_aperture_base, uint64_t alternate_aperture_size); -static int update_qpd_vi(struct device_queue_manager *dqm, - struct qcm_process_device *qpd); static int update_qpd_vi_tonga(struct device_queue_manager *dqm, struct qcm_process_device *qpd); -static void init_sdma_vm(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd); static void init_sdma_vm_tonga(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); -void device_queue_manager_init_vi( - struct device_queue_manager_asic_ops *asic_ops) -{ - asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; - asic_ops->update_qpd = update_qpd_vi; - asic_ops->init_sdma_vm = init_sdma_vm; -} - void device_queue_manager_init_vi_tonga( struct device_queue_manager_asic_ops *asic_ops) { @@ -65,6 +61,15 @@ void device_queue_manager_init_vi_tonga( asic_ops->init_sdma_vm = init_sdma_vm_tonga; } + +void device_queue_manager_init_vi( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; + asic_ops->update_qpd = update_qpd_vi; + asic_ops->init_sdma_vm = init_sdma_vm; +} + static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) { /* In 64-bit mode, 
we can only control the top 3 bits of the LDS, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index ebe79bf..fc41689 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -115,7 +115,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) pr_debug("doorbell aperture size == 0x%08lX\n", kfd->shared_resources.doorbell_aperture_size); - pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); + pr_debug("doorbell kernel address == 0x%p\n", kfd->doorbell_kernel_ptr); return 0; } @@ -188,9 +188,9 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, *doorbell_off = kfd->doorbell_id_offset + inx; pr_debug("Get kernel queue doorbell\n" - " doorbell offset == 0x%08X\n" - " doorbell index == 0x%x\n", - *doorbell_off, inx); + " doorbell offset == 0x%08X\n" + " kernel address == 0x%p\n", + *doorbell_off, (kfd->doorbell_kernel_ptr + inx)); return kfd->doorbell_kernel_ptr + inx; } @@ -199,8 +199,7 @@ void kfd_release_kernel_doorbell(struct kfd_dev *kfd, u32 __iomem *db_addr) { unsigned int inx; - inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr) - * sizeof(u32) / kfd->device_info->doorbell_size; + inx = (unsigned int)(db_addr - kfd->doorbell_kernel_ptr); mutex_lock(&kfd->doorbell_mutex); __clear_bit(inx, kfd->doorbell_available_index); @@ -211,7 +210,7 @@ void write_kernel_doorbell(void __iomem *db, u32 value) { if (db) { writel(value, db); - pr_debug("Writing %d to doorbell address %p\n", value, db); + pr_debug("Writing %d to doorbell address 0x%p\n", value, db); } } @@ -221,10 +220,14 @@ void write_kernel_doorbell64(void __iomem *db, u64 value) WARN(((unsigned long)db & 7) != 0, "Unaligned 64-bit doorbell"); writeq(value, (u64 __iomem *)db); - pr_debug("writing %llu to doorbell address %p\n", value, db); + pr_debug("writing %llu to doorbell address 0x%p\n", value, db); } } +/* + * queue_ids are in the range [0,MAX_PROCESS_QUEUES) and are mapped 1:1 + * to doorbells with the process's doorbell page + */ unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, struct kfd_process *process, unsigned int doorbell_id) @@ -236,8 +239,7 @@ unsigned int kfd_doorbell_id_to_offset(struct kfd_dev *kfd, * units regardless of the ASIC-dependent doorbell size. 
*/ return kfd->doorbell_id_offset + - process->doorbell_index - * kfd_doorbell_process_slice(kfd) / sizeof(u32) + + process->doorbell_index * (kfd_doorbell_process_slice(kfd)/sizeof(u32)) + doorbell_id * kfd->device_info->doorbell_size / sizeof(u32); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 1dc1584..a92ca78 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -51,8 +51,8 @@ struct kfd_event_waiter { */ struct kfd_signal_page { uint64_t *kernel_address; + uint64_t handle; uint64_t __user *user_address; - bool need_to_free_pages; }; @@ -80,7 +80,6 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p) KFD_SIGNAL_EVENT_LIMIT * 8); page->kernel_address = backing_store; - page->need_to_free_pages = true; pr_debug("Allocated new event signal page at %p, for process %p\n", page, p); @@ -100,17 +99,9 @@ static int allocate_event_notification_slot(struct kfd_process *p, p->signal_page = allocate_signal_page(p); if (!p->signal_page) return -ENOMEM; - /* Oldest user mode expects 256 event slots */ - p->signal_mapped_size = 256*8; } - /* - * Compatibility with old user mode: Only use signal slots - * user mode has mapped, may be less than - * KFD_SIGNAL_EVENT_LIMIT. This also allows future increase - * of the event limit without breaking user mode. - */ - id = idr_alloc(&p->event_idr, ev, 0, p->signal_mapped_size / 8, + id = idr_alloc(&p->event_idr, ev, 0, KFD_SIGNAL_EVENT_LIMIT, GFP_KERNEL); if (id < 0) return id; @@ -121,6 +112,29 @@ static int allocate_event_notification_slot(struct kfd_process *p, return 0; } +static struct kfd_signal_page *allocate_signal_page_dgpu( + struct kfd_process *p, uint64_t *kernel_address, uint64_t handle) +{ + struct kfd_signal_page *my_page; + + my_page = kzalloc(sizeof(*my_page), GFP_KERNEL); + if (!my_page) + return NULL; + + /* Initialize all events to unsignaled */ + memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, + KFD_SIGNAL_EVENT_LIMIT * 8); + + my_page->kernel_address = kernel_address; + my_page->handle = handle; + my_page->user_address = NULL; + + pr_debug("Allocated new event signal page at %p, for process %p\n", + my_page, p); + + return my_page; +} + /* * Assumes that p->event_mutex is held and of course that p is not going * away (current or locked). 
@@ -184,8 +198,7 @@ static int create_signal_event(struct file *devkfd, { int ret; - if (p->signal_mapped_size && - p->signal_event_count == p->signal_mapped_size / 8) { + if (p->signal_event_count == KFD_SIGNAL_EVENT_LIMIT) { if (!p->signal_event_limit_reached) { pr_warn("Signal event wasn't created because limit was reached\n"); p->signal_event_limit_reached = true; @@ -271,9 +284,9 @@ static void shutdown_signal_page(struct kfd_process *p) struct kfd_signal_page *page = p->signal_page; if (page) { - if (page->need_to_free_pages) + if (page->user_address) free_pages((unsigned long)page->kernel_address, - get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); + get_order(KFD_SIGNAL_EVENT_LIMIT * 8)); kfree(page); } } @@ -295,34 +308,11 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev) return ev->type == KFD_EVENT_TYPE_SIGNAL; } -int kfd_event_page_set(struct kfd_process *p, void *kernel_address, - uint64_t size) -{ - struct kfd_signal_page *page; - - if (p->signal_page) - return -EBUSY; - - page = kzalloc(sizeof(*page), GFP_KERNEL); - if (!page) - return -ENOMEM; - - /* Initialize all events to unsignaled */ - memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT, - KFD_SIGNAL_EVENT_LIMIT * 8); - - page->kernel_address = kernel_address; - - p->signal_page = page; - p->signal_mapped_size = size; - - return 0; -} - int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index) + uint64_t *event_page_offset, uint32_t *event_slot_index, + void *kern_addr) { int ret = 0; struct kfd_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -336,10 +326,19 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, init_waitqueue_head(&ev->wq); - *event_page_offset = 0; - mutex_lock(&p->event_mutex); + if (kern_addr && !p->signal_page) { + p->signal_page = allocate_signal_page_dgpu(p, kern_addr, + *event_page_offset); + if (!p->signal_page) { + ret = -ENOMEM; + goto out; + } + } + + *event_page_offset = 0; + switch (event_type) { case KFD_EVENT_TYPE_SIGNAL: case KFD_EVENT_TYPE_DEBUG: @@ -362,6 +361,7 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p, kfree(ev); } +out: mutex_unlock(&p->event_mutex); return ret; @@ -390,11 +390,7 @@ static void set_event(struct kfd_event *ev) { struct kfd_event_waiter *waiter; - /* Auto reset if the list is non-empty and we're waking - * someone. waitqueue_active is safe here because we're - * protected by the p->event_mutex, which is also held when - * updating the wait queues in kfd_wait_on_events. - */ + /* Auto reset if the list is non-empty and we're waking someone. 
*/ ev->signaled = !ev->auto_reset || !waitqueue_active(&ev->wq); list_for_each_entry(waiter, &ev->wq.head, wait.entry) @@ -781,12 +777,12 @@ int kfd_wait_on_events(struct kfd_process *p, int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) { + unsigned long pfn; struct kfd_signal_page *page; - int ret; - /* check required size doesn't exceed the allocated size */ - if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) < + /* check required size is logical */ + if (get_order(KFD_SIGNAL_EVENT_LIMIT * 8) != get_order(vma->vm_end - vma->vm_start)) { pr_err("Event page mmap requested illegal size\n"); return -EINVAL; @@ -816,12 +812,8 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma) page->user_address = (uint64_t __user *)vma->vm_start; /* mapping the page to user process */ - ret = remap_pfn_range(vma, vma->vm_start, pfn, + return remap_pfn_range(vma, vma->vm_start, pfn, vma->vm_end - vma->vm_start, vma->vm_page_prot); - if (!ret) - p->signal_mapped_size = vma->vm_end - vma->vm_start; - - return ret; } /* @@ -1012,30 +1004,3 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, mutex_unlock(&p->event_mutex); kfd_unref_process(p); } - -void kfd_signal_reset_event(struct kfd_dev *dev) -{ - struct kfd_hsa_hw_exception_data hw_exception_data; - struct kfd_process *p; - struct kfd_event *ev; - unsigned int temp; - uint32_t id, idx; - - /* Whole gpu reset caused by GPU hang , and memory is lost */ - memset(&hw_exception_data, 0, sizeof(hw_exception_data)); - hw_exception_data.gpu_id = dev->id; - hw_exception_data.memory_lost = 1; - - idx = srcu_read_lock(&kfd_processes_srcu); - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - mutex_lock(&p->event_mutex); - id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) - if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { - ev->hw_exception_data = hw_exception_data; - set_event(ev); - } - mutex_unlock(&p->event_mutex); - } - srcu_read_unlock(&kfd_processes_srcu, idx); -} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_events.h index c7ac6c7..abca5bf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.h @@ -66,7 +66,6 @@ struct kfd_event { /* type specific data */ union { struct kfd_hsa_memory_exception_data memory_exception_data; - struct kfd_hsa_hw_exception_data hw_exception_data; }; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 8f123a2..2c00711 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -289,6 +289,7 @@ #define MAKE_LDS_APP_BASE_VI() \ (((uint64_t)(0x1UL) << 61) + 0x0) + #define MAKE_LDS_APP_LIMIT(base) \ (((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF) @@ -312,7 +313,17 @@ #define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE) #define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE) -static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) +int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, + uint64_t base, uint64_t limit) +{ + if (base < SVM_USER_BASE) { + pr_err("Set dgpu vm base 0x%llx failed.\n", base); + return -EINVAL; + } + return 0; +} + +void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) { /* * node id couldn't be 0 - the three MSB bits of @@ -321,42 +332,19 @@ static void kfd_init_apertures_vi(struct kfd_process_device *pdd, uint8_t id) pdd->lds_base = 
MAKE_LDS_APP_BASE_VI(); pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - if (!pdd->dev->device_info->needs_iommu_device) { - /* dGPUs: SVM aperture starting at 0 - * with small reserved space for kernel. - * Set them to CANONICAL addresses. - */ - pdd->gpuvm_base = SVM_USER_BASE; - pdd->gpuvm_limit = - pdd->dev->shared_resources.gpuvm_size - 1; - } else { - /* set them to non CANONICAL addresses, and no SVM is - * allocated. - */ - pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); - pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base, - pdd->dev->shared_resources.gpuvm_size); - } + pdd->gpuvm_base = MAKE_GPUVM_APP_BASE_VI(id + 1); + pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT( + pdd->gpuvm_base, pdd->dev->shared_resources.gpuvm_size); pdd->scratch_base = MAKE_SCRATCH_APP_BASE_VI(); pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); } -static void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) +void kfd_init_apertures_v9(struct kfd_process_device *pdd, uint8_t id) { pdd->lds_base = MAKE_LDS_APP_BASE_V9(); pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base); - /* Raven needs SVM to support graphic handle, etc. Leave the small - * reserved space before SVM on Raven as well, even though we don't - * have to. - * Set gpuvm_base and gpuvm_limit to CANONICAL addresses so that they - * are used in Thunk to reserve SVM. - */ - pdd->gpuvm_base = SVM_USER_BASE; - pdd->gpuvm_limit = - pdd->dev->shared_resources.gpuvm_size - 1; - pdd->scratch_base = MAKE_SCRATCH_APP_BASE_V9(); pdd->scratch_limit = MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base); } @@ -377,10 +365,10 @@ int kfd_init_apertures(struct kfd_process *process) pdd = kfd_create_process_device_data(dev, process); if (!pdd) { pr_err("Failed to create process device data\n"); - return -ENOMEM; + return -1; } /* - * For 64 bit process apertures will be statically reserved in + * For 64 bit process aperture will be statically reserved in * the x86_64 non canonical process address space * amdkfd doesn't currently support apertures for 32 bit process */ @@ -400,20 +388,21 @@ int kfd_init_apertures(struct kfd_process *process) kfd_init_apertures_vi(pdd, id); break; case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: kfd_init_apertures_v9(pdd, id); break; default: - WARN(1, "Unexpected ASIC family %u", - dev->device_info->asic_family); - return -EINVAL; + pr_err("Unknown chip in kfd_init_apertures\n"); + return -1; } if (!dev->device_info->needs_iommu_device) { - /* dGPUs: the reserved space for kernel - * before SVM + /* dGPUs: SVM aperture starting at 0 + * with small reserved space for kernel */ + pdd->gpuvm_base = SVM_USER_BASE; + pdd->gpuvm_limit = + dev->shared_resources.gpuvm_size - 1; pdd->qpd.cwsr_base = SVM_CWSR_BASE; pdd->qpd.ib_base = SVM_IB_BASE; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index f836897..009d6f4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. 
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -25,43 +25,70 @@
 #include "soc15_int.h"
 
+static uint32_t kfd_get_pasid_from_vmid(struct kfd_dev *dev, uint8_t vmid)
+{
+	uint32_t pasid = 0;
+	const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
+
+	if (f2g->get_atc_vmid_pasid_mapping_valid(dev->kgd, vmid))
+		pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
+
+	return pasid;
+}
+
 static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 					const uint32_t *ih_ring_entry,
 					uint32_t *patched_ihre,
 					bool *patched_flag)
 {
 	uint16_t source_id, client_id, pasid, vmid;
-	const uint32_t *data = ih_ring_entry;
+	bool result = false;
 
-	/* Only handle interrupts from KFD VMIDs */
+	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
+	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
 	vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
-	if (vmid < dev->vm_info.first_vmid_kfd ||
-	    vmid > dev->vm_info.last_vmid_kfd)
-		return 0;
 
-	/* If there is no valid PASID, it's likely a firmware bug */
-	pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
-	if (WARN_ONCE(pasid == 0, "FW bug: No PASID in KFD interrupt"))
-		return 0;
+	if (pasid) {
+		const uint32_t *data = ih_ring_entry;
 
-	source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
-	client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
+		pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
+			 client_id, source_id, pasid);
+		pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+			 data[0], data[1], data[2], data[3],
+			 data[4], data[5], data[6], data[7]);
+	}
+
+	if ((vmid >= dev->vm_info.first_vmid_kfd &&
+	     vmid <= dev->vm_info.last_vmid_kfd) &&
+	    (source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
+	     source_id == SOC15_INTSRC_SDMA_TRAP ||
+	     source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
+	     source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+	     client_id == SOC15_IH_CLIENTID_VMC ||
+	     client_id == SOC15_IH_CLIENTID_UTCL2)) {
+
+		/*
+		 * KFD want to handle this INT, but MEC firmware did
+		 * not send pasid. Try to get it from vmid mapping
+		 * and patch the ih entry. It's a temp workaround.
+		 */
+		WARN_ONCE((!pasid), "Fix me.\n");
+		if (!pasid) {
+			uint32_t temp = le32_to_cpu(ih_ring_entry[3]);
+
+			pasid = kfd_get_pasid_from_vmid(dev, vmid);
+			memcpy(patched_ihre, ih_ring_entry,
+				dev->device_info->ih_ring_entry_size);
+			patched_ihre[3] = cpu_to_le32(temp | pasid);
+			*patched_flag = true;
+		}
+		result = pasid ? true : false;
+	}
+
+	/* Do not process in ISR, just request it to be forwarded to WQ. */
+	return result;
 
-	pr_debug("client id 0x%x, source id %d, pasid 0x%x. raw data:\n",
-		 client_id, source_id, pasid);
-	pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
-		 data[0], data[1], data[2], data[3],
-		 data[4], data[5], data[6], data[7]);
-
-	/* Interrupt types we care about: various signals and faults.
-	 * They will be forwarded to a work queue (see below).
- */ - return source_id == SOC15_INTSRC_CP_END_OF_PIPE || - source_id == SOC15_INTSRC_SDMA_TRAP || - source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || - source_id == SOC15_INTSRC_CP_BAD_OPCODE || - client_id == SOC15_IH_CLIENTID_VMC || - client_id == SOC15_IH_CLIENTID_UTCL2; } static void event_interrupt_wq_v9(struct kfd_dev *dev, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 7a61f38..5b798f9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -75,8 +75,7 @@ int kfd_iommu_device_init(struct kfd_dev *kfd) } if ((iommu_info.flags & required_iommu_flags) != required_iommu_flags) { - dev_err(kfd_device, - "error required iommu flags ats %i, pri %i, pasid %i\n", + dev_err(kfd_device, "error required iommu flags ats %i, pri %i, pasid %i\n", (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP) != 0, (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) != 0, (iommu_info.flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c index a53d954..97806ed 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_ipc.c @@ -140,7 +140,7 @@ static int kfd_import_dmabuf_create_kfd_bo(struct kfd_dev *dev, goto err_unlock; idr_handle = kfd_process_device_create_obj_handle(pdd, mem, - va_addr, size, 0, 0, + va_addr, size, ipc_obj); if (idr_handle < 0) { r = -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index e78445d..8cf9d44 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -59,7 +59,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, switch (type) { case KFD_QUEUE_TYPE_DIQ: case KFD_QUEUE_TYPE_HIQ: - kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm, + kq->mqd = dev->dqm->ops.get_mqd_manager(dev->dqm, KFD_MQD_TYPE_HIQ); break; default: @@ -67,7 +67,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, return false; } - if (!kq->mqd_mgr) + if (!kq->mqd) return false; prop.doorbell_ptr = kfd_get_kernel_doorbell(dev, &prop.doorbell_off); @@ -131,7 +131,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->queue->device = dev; kq->queue->process = kfd_get_process(current); - retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, + retval = kq->mqd->init_mqd(kq->mqd, &kq->queue->mqd, &kq->queue->mqd_mem_obj, &kq->queue->gart_mqd_addr, &kq->queue->properties); @@ -143,9 +143,9 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, pr_debug("Assigning hiq to hqd\n"); kq->queue->pipe = KFD_CIK_HIQ_PIPE; kq->queue->queue = KFD_CIK_HIQ_QUEUE; - kq->mqd_mgr->load_mqd(kq->mqd_mgr, kq->queue->mqd, - kq->queue->pipe, kq->queue->queue, - &kq->queue->properties, NULL); + kq->mqd->load_mqd(kq->mqd, kq->queue->mqd, kq->queue->pipe, + kq->queue->queue, &kq->queue->properties, + NULL); } else { /* allocate fence for DIQ */ @@ -183,7 +183,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, static void uninitialize(struct kernel_queue *kq) { if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) - kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, + kq->mqd->destroy_mqd(kq->mqd, kq->queue->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, @@ -192,8 +192,7 @@ static void uninitialize(struct kernel_queue *kq) else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, 
kq->fence_mem_obj); - kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd, - kq->queue->mqd_mem_obj); + kq->mqd->uninit_mqd(kq->mqd, kq->queue->mqd, kq->queue->mqd_mem_obj); kfd_gtt_sa_free(kq->dev, kq->rptr_mem); kfd_gtt_sa_free(kq->dev, kq->wptr_mem); @@ -316,13 +315,7 @@ static void submit_packet(struct kernel_queue *kq) static void rollback_packet(struct kernel_queue *kq) { - if (kq->dev->device_info->doorbell_size == 8) { - kq->pending_wptr64 = *kq->wptr64_kernel; - kq->pending_wptr = *kq->wptr_kernel % - (kq->queue->properties.queue_size / 4); - } else { - kq->pending_wptr = *kq->wptr_kernel; - } + kq->pending_wptr = *kq->queue->properties.write_ptr; } struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, @@ -356,7 +349,6 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, break; case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: kernel_queue_init_v9(&kq->ops_asic_specific); break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index 384d7a3..82c94a6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -80,7 +80,7 @@ struct kernel_queue { /* data */ struct kfd_dev *dev; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd; struct queue *queue; uint64_t pending_wptr64; uint32_t pending_wptr; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c index 19e54ac..2808422 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_cik.c @@ -22,6 +22,8 @@ */ #include "kfd_kernel_queue.h" +#include "kfd_pm4_headers.h" +#include "kfd_pm4_opcodes.h" static bool initialize_cik(struct kernel_queue *kq, struct kfd_dev *dev, enum kfd_queue_type type, unsigned int queue_size); @@ -51,3 +53,120 @@ static void submit_packet_cik(struct kernel_queue *kq) write_kernel_doorbell(kq->queue->properties.doorbell_ptr, kq->pending_wptr); } + +static int pm_map_process_cik(struct packet_manager *pm, uint32_t *buffer, + struct qcm_process_device *qpd) +{ + struct pm4_map_process *packet; + + packet = (struct pm4_map_process *)buffer; + + memset(buffer, 0, sizeof(struct pm4_map_process)); + + packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields10.gds_size = qpd->gds_size; + packet->bitfields10.num_gws = qpd->num_gws; + packet->bitfields10.num_oac = qpd->num_oac; + packet->bitfields10.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static int pm_map_process_scratch_cik(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_map_process_scratch_kv *packet; + + packet = (struct pm4_map_process_scratch_kv *)buffer; + + memset(buffer, 0, sizeof(struct pm4_map_process_scratch_kv)); + + packet->header.u32all = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_map_process_scratch_kv)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields3.page_table_base = qpd->page_table_base; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + packet->sh_mem_ape1_base = qpd->sh_mem_ape1_base; + packet->sh_mem_ape1_limit = qpd->sh_mem_ape1_limit; + + packet->sh_hidden_private_base_vmid = qpd->sh_hidden_private_base; + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + return 0; +} + +static uint32_t pm_get_map_process_packet_size_cik(void) +{ + return sizeof(struct pm4_map_process); +} +static uint32_t pm_get_map_process_scratch_packet_size_cik(void) +{ + return sizeof(struct pm4_map_process_scratch_kv); +} + + +static struct packet_manager_funcs kfd_cik_pm_funcs = { + .map_process = pm_map_process_cik, + .runlist = pm_runlist_vi, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_vi, + .unmap_queues = pm_unmap_queues_vi, + .query_status = pm_query_status_vi, + .release_mem = pm_release_mem_vi, + .get_map_process_packet_size = pm_get_map_process_packet_size_cik, + .get_runlist_packet_size = pm_get_runlist_packet_size_vi, + .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, + .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, + .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, + .get_query_status_packet_size = pm_get_query_status_packet_size_vi, + .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, +}; + +static struct packet_manager_funcs kfd_cik_scratch_pm_funcs = { + .map_process = pm_map_process_scratch_cik, + .runlist = pm_runlist_vi, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_vi, + .unmap_queues = pm_unmap_queues_vi, + .query_status = pm_query_status_vi, + .release_mem = pm_release_mem_vi, + .get_map_process_packet_size = + pm_get_map_process_scratch_packet_size_cik, + .get_runlist_packet_size = pm_get_runlist_packet_size_vi, + .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, + .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, + .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, + .get_query_status_packet_size = pm_get_query_status_packet_size_vi, + .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, +}; + +void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver) +{ + if (fw_ver >= 
KFD_SCRATCH_KV_FW_VER) + pm->pmf = &kfd_cik_scratch_pm_funcs; + else + pm->pmf = &kfd_cik_pm_funcs; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c index 33830b1..5fe4f60 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -44,7 +44,7 @@ static bool initialize_v9(struct kernel_queue *kq, struct kfd_dev *dev, int retval; retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); - if (retval) + if (retval != 0) return false; kq->eop_gpu_addr = kq->eop_mem->gpu_addr; @@ -71,7 +71,8 @@ static int pm_map_process_v9(struct packet_manager *pm, uint32_t *buffer, struct qcm_process_device *qpd) { struct pm4_mes_map_process *packet; - uint64_t vm_page_table_base_addr = qpd->page_table_base; + uint64_t vm_page_table_base_addr = + (uint64_t)(qpd->page_table_base) << 12; packet = (struct pm4_mes_map_process *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_map_process)); @@ -125,6 +126,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, concurrent_proc_cnt = min(pm->dqm->processes_count, kfd->max_proc_per_quantum); + packet = (struct pm4_mes_runlist *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_runlist)); @@ -293,7 +295,7 @@ static int pm_query_status_v9(struct packet_manager *pm, uint32_t *buffer, } -static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) +static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) { struct pm4_mec_release_mem *packet; @@ -318,22 +320,58 @@ static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) packet->data_lo = 0; - return 0; + return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +} + +static uint32_t pm_get_map_process_packet_size_v9(void) +{ + return sizeof(struct pm4_mes_map_process); +} + +static uint32_t pm_get_runlist_packet_size_v9(void) +{ + return sizeof(struct pm4_mes_runlist); +} + +static uint32_t pm_get_map_queues_packet_size_v9(void) +{ + return sizeof(struct pm4_mes_map_queues); +} + +static uint32_t pm_get_unmap_queues_packet_size_v9(void) +{ + return sizeof(struct pm4_mes_unmap_queues); +} + +static uint32_t pm_get_query_status_packet_size_v9(void) +{ + return sizeof(struct pm4_mes_query_status); +} + +static uint32_t pm_get_release_mem_packet_size_v9(void) +{ + return sizeof(struct pm4_mec_release_mem); } -const struct packet_manager_funcs kfd_v9_pm_funcs = { - .map_process = pm_map_process_v9, - .runlist = pm_runlist_v9, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_v9, - .unmap_queues = pm_unmap_queues_v9, - .query_status = pm_query_status_v9, - .release_mem = pm_release_mem_v9, - .map_process_size = sizeof(struct pm4_mes_map_process), - .runlist_size = sizeof(struct pm4_mes_runlist), - .set_resources_size = sizeof(struct pm4_mes_set_resources), - .map_queues_size = sizeof(struct pm4_mes_map_queues), - .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .query_status_size = sizeof(struct pm4_mes_query_status), - .release_mem_size = sizeof(struct pm4_mec_release_mem) +static struct packet_manager_funcs kfd_v9_pm_funcs = { + .map_process = pm_map_process_v9, + .runlist = pm_runlist_v9, + .set_resources = pm_set_resources_vi, + .map_queues = 
pm_map_queues_v9, + .unmap_queues = pm_unmap_queues_v9, + .query_status = pm_query_status_v9, + .release_mem = pm_release_mem_v9, + .get_map_process_packet_size = pm_get_map_process_packet_size_v9, + .get_runlist_packet_size = pm_get_runlist_packet_size_v9, + .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, + .get_map_queues_packet_size = pm_get_map_queues_packet_size_v9, + .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_v9, + .get_query_status_packet_size = pm_get_query_status_packet_size_v9, + .get_release_mem_packet_size = pm_get_release_mem_packet_size_v9, }; + +void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver) +{ + pm->pmf = &kfd_v9_pm_funcs; +} + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index bf20c6d..9022ecb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -67,25 +67,12 @@ static void submit_packet_vi(struct kernel_queue *kq) kq->pending_wptr); } -unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) -{ - union PM4_MES_TYPE_3_HEADER header; - - header.u32All = 0; - header.opcode = opcode; - header.count = packet_size / 4 - 2; - header.type = PM4_TYPE_3; - - return header.u32All; -} - -static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, - struct qcm_process_device *qpd) +static int pm_map_process_vi(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) { struct pm4_mes_map_process *packet; packet = (struct pm4_mes_map_process *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_map_process)); packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, @@ -112,16 +99,27 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, return 0; } -static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, + +unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size) +{ + union PM4_MES_TYPE_3_HEADER header; + + header.u32All = 0; + header.opcode = opcode; + header.count = packet_size / 4 - 2; + header.type = PM4_TYPE_3; + + return header.u32All; +} + +int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, uint64_t ib, size_t ib_size_in_dwords, bool chain) { struct pm4_mes_runlist *packet; + int concurrent_proc_cnt = 0; struct kfd_dev *kfd = pm->dqm->dev; - if (WARN_ON(!ib)) - return -EFAULT; - /* Determine the number of processes to map together to HW: * it can not exceed the number of VMIDs available to the * scheduler, and it is determined by the smaller of the number @@ -134,6 +132,7 @@ static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, concurrent_proc_cnt = min(pm->dqm->processes_count, kfd->max_proc_per_quantum); + packet = (struct pm4_mes_runlist *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_runlist)); @@ -151,35 +150,7 @@ static int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, return 0; } -int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res) -{ - struct pm4_mes_set_resources *packet; - - packet = (struct pm4_mes_set_resources *)buffer; - memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); - - packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, - sizeof(struct pm4_mes_set_resources)); - - packet->bitfields2.queue_type = - queue_type__mes_set_resources__hsa_interface_queue_hiq; - packet->bitfields2.vmid_mask = res->vmid_mask; - packet->bitfields2.unmap_latency = 
KFD_UNMAP_LATENCY_MS / 100; - packet->bitfields7.oac_mask = res->oac_mask; - packet->bitfields8.gds_heap_base = res->gds_heap_base; - packet->bitfields8.gds_heap_size = res->gds_heap_size; - - packet->gws_mask_lo = lower_32_bits(res->gws_mask); - packet->gws_mask_hi = upper_32_bits(res->gws_mask); - - packet->queue_mask_lo = lower_32_bits(res->queue_mask); - packet->queue_mask_hi = upper_32_bits(res->queue_mask); - - return 0; -} - -static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, +int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { struct pm4_mes_map_queues *packet; @@ -238,7 +209,35 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, return 0; } -static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, +int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res) +{ + struct pm4_mes_set_resources *packet; + + packet = (struct pm4_mes_set_resources *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); + + packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_mes_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + return 0; +} + +int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, enum kfd_queue_type type, enum kfd_unmap_queues_filter filter, uint32_t filter_param, bool reset, @@ -303,7 +302,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, } -static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, +int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, uint64_t fence_address, uint32_t fence_value) { struct pm4_mes_query_status *packet; @@ -311,6 +310,7 @@ static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, packet = (struct pm4_mes_query_status *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, sizeof(struct pm4_mes_query_status)); @@ -328,15 +328,16 @@ static int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, return 0; } -static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) + +uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) { struct pm4_mec_release_mem *packet; packet = (struct pm4_mec_release_mem *)buffer; - memset(buffer, 0, sizeof(*packet)); + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, - sizeof(*packet)); + sizeof(struct pm4_mec_release_mem)); packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; packet->bitfields2.event_index = event_index___release_mem__end_of_pipe; @@ -354,22 +355,63 @@ static int pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer) packet->data_lo = 0; - return 0; + return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +} + +uint32_t pm_get_map_process_packet_size_vi(void) +{ + return sizeof(struct 
pm4_mes_map_process); +} + +uint32_t pm_get_runlist_packet_size_vi(void) +{ + return sizeof(struct pm4_mes_runlist); +} + +uint32_t pm_get_set_resources_packet_size_vi(void) +{ + return sizeof(struct pm4_mes_set_resources); +} + +uint32_t pm_get_map_queues_packet_size_vi(void) +{ + return sizeof(struct pm4_mes_map_queues); +} + +uint32_t pm_get_unmap_queues_packet_size_vi(void) +{ + return sizeof(struct pm4_mes_unmap_queues); +} + +uint32_t pm_get_query_status_packet_size_vi(void) +{ + return sizeof(struct pm4_mes_query_status); } -const struct packet_manager_funcs kfd_vi_pm_funcs = { - .map_process = pm_map_process_vi, - .runlist = pm_runlist_vi, - .set_resources = pm_set_resources_vi, - .map_queues = pm_map_queues_vi, - .unmap_queues = pm_unmap_queues_vi, - .query_status = pm_query_status_vi, - .release_mem = pm_release_mem_vi, - .map_process_size = sizeof(struct pm4_mes_map_process), - .runlist_size = sizeof(struct pm4_mes_runlist), - .set_resources_size = sizeof(struct pm4_mes_set_resources), - .map_queues_size = sizeof(struct pm4_mes_map_queues), - .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), - .query_status_size = sizeof(struct pm4_mes_query_status), - .release_mem_size = sizeof(struct pm4_mec_release_mem) +uint32_t pm_get_release_mem_packet_size_vi(void) +{ + return sizeof(struct pm4_mec_release_mem); +} + + +static struct packet_manager_funcs kfd_vi_pm_funcs = { + .map_process = pm_map_process_vi, + .runlist = pm_runlist_vi, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_vi, + .unmap_queues = pm_unmap_queues_vi, + .query_status = pm_query_status_vi, + .release_mem = pm_release_mem_vi, + .get_map_process_packet_size = pm_get_map_process_packet_size_vi, + .get_runlist_packet_size = pm_get_runlist_packet_size_vi, + .get_set_resources_packet_size = pm_get_set_resources_packet_size_vi, + .get_map_queues_packet_size = pm_get_map_queues_packet_size_vi, + .get_unmap_queues_packet_size = pm_get_unmap_queues_packet_size_vi, + .get_query_status_packet_size = pm_get_query_status_packet_size_vi, + .get_release_mem_packet_size = pm_get_release_mem_packet_size_vi, }; + +void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver) +{ + pm->pmf = &kfd_vi_pm_funcs; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 261657f..34d44ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -63,7 +63,7 @@ MODULE_PARM_DESC(hws_max_conc_proc, int cwsr_enable = 1; module_param(cwsr_enable, int, 0444); -MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = off, 1 = on (default))"); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); @@ -75,6 +75,8 @@ module_param(send_sigterm, int, 0444); MODULE_PARM_DESC(send_sigterm, "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); +static int amdkfd_init_completed; + int debug_largebar; module_param(debug_largebar, int, 0444); MODULE_PARM_DESC(debug_largebar, @@ -85,23 +87,16 @@ module_param(ignore_crat, int, 0444); MODULE_PARM_DESC(ignore_crat, "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); -int noretry = 1; -module_param(noretry, int, 0644); +int vega10_noretry = 1; +module_param_named(noretry, vega10_noretry, int, 0644); MODULE_PARM_DESC(noretry, - "Set sh_mem_config.retry_disable on GFXv9+ dGPUs (0 = retry 
enabled, 1 = retry disabled (default))"); + "Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled, 1 = retry disabled (default))"); int priv_cp_queues; module_param(priv_cp_queues, int, 0644); MODULE_PARM_DESC(priv_cp_queues, "Enable privileged mode for CP queues (0 = off (default), 1 = on)"); -int halt_if_hws_hang; -module_param(halt_if_hws_hang, int, 0644); -MODULE_PARM_DESC(halt_if_hws_hang, "Halt if HWS hang is detected (0 = off (default), 1 = on)"); - - -static int amdkfd_init_completed; - int kgd2kfd_init(unsigned int interface_version, const struct kgd2kfd_calls **g2f) { @@ -154,7 +149,7 @@ static int __init kfd_module_init(void) err = kfd_ipc_init(); if (err < 0) - goto err_ipc; + goto err_topology; err = kfd_process_create_wq(); if (err < 0) @@ -171,8 +166,6 @@ static int __init kfd_module_init(void) return 0; err_create_wq: -err_ipc: - kfd_topology_shutdown(); err_topology: kfd_chardev_exit(); err_ioctl: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index d39e81c..8279b74 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -81,7 +81,6 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, case CHIP_POLARIS11: return mqd_manager_init_vi_tonga(type, dev); case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: return mqd_manager_init_v9(type, dev); default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 336ea9c..dcaeda8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -94,8 +94,6 @@ struct mqd_manager { u32 *ctl_stack_used_size, u32 *save_area_used_size); - bool (*check_queue_active)(struct queue *q); - #if defined(CONFIG_DEBUG_FS) int (*debugfs_show_mqd)(struct seq_file *m, void *data); #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 2441834..bd44a23 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -42,31 +42,6 @@ static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) return (struct cik_sdma_rlc_registers *)mqd; } -static bool check_sdma_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - struct cik_sdma_rlc_registers *m = get_sdma_mqd(q->mqd); - - rptr = m->sdma_rlc_rb_rptr; - wptr = m->sdma_rlc_rb_wptr; - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - - return (rptr != wptr); -} - -static bool check_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - struct cik_mqd *m = get_mqd(q->mqd); - - rptr = m->cp_hqd_pq_rptr; - wptr = m->cp_hqd_pq_wptr; - - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - - return (rptr != wptr); -} - static void update_cu_mask(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { @@ -516,7 +491,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -528,7 +502,6 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -540,7 +513,6 @@ struct 
mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; - mqd->check_queue_active = check_sdma_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index dcd24c4..f4e8efc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -41,49 +41,6 @@ static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) return (struct v9_sdma_mqd *)mqd; } -static bool check_sdma_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - uint32_t rptr_hi, wptr_hi; - struct v9_sdma_mqd *m = get_sdma_mqd(q->mqd); - - rptr = m->sdmax_rlcx_rb_rptr; - wptr = m->sdmax_rlcx_rb_wptr; - rptr_hi = m->sdmax_rlcx_rb_rptr_hi; - wptr_hi = m->sdmax_rlcx_rb_wptr_hi; - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - pr_debug("rptr_hi=%d, wptr_hi=%d\n", rptr_hi, wptr_hi); - - return (rptr != wptr || rptr_hi != wptr_hi); -} - -static bool check_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - uint32_t cntl_stack_offset, cntl_stack_size; - struct v9_mqd *m = get_mqd(q->mqd); - - rptr = m->cp_hqd_pq_rptr; - wptr = m->cp_hqd_pq_wptr_lo % q->properties.queue_size; - cntl_stack_offset = m->cp_hqd_cntl_stack_offset; - cntl_stack_size = m->cp_hqd_cntl_stack_size; - - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - pr_debug("m->cp_hqd_cntl_stack_offset=0x%08x\n", cntl_stack_offset); - pr_debug("m->cp_hqd_cntl_stack_size=0x%08x\n", cntl_stack_size); - - if ((rptr == 0 && wptr == 0) || - cntl_stack_offset == 0xffffffff || - cntl_stack_size > 0x5000) - return false; - - /* Process is idle if both conditions are meet: - * queue's rptr equals to wptr - * control stack is empty, cntl_stack_offset = cntl_stack_size - */ - return (rptr != wptr || cntl_stack_offset != cntl_stack_size); -} - static void update_cu_mask(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { @@ -158,7 +115,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), &((*mqd_mem_obj)->gtt_mem), &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr), true); + (void *)&((*mqd_mem_obj)->cpu_ptr)); } else retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), mqd_mem_obj); @@ -202,7 +159,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); } - if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + if (mm->dev->cwsr_enabled) { m->cp_hqd_persistent_state |= (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); m->cp_hqd_ctx_save_base_addr_lo = @@ -260,9 +217,8 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", m->cp_hqd_pq_doorbell_control); - m->cp_hqd_ib_control = - 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | - 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT | + 1 << CP_HQD_IB_CONTROL__IB_EXE_DISABLE__SHIFT; /* * HW does not clamp this field correctly. 
Maximum EOP queue size @@ -287,13 +243,13 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT | 1 << CP_HQD_PQ_CONTROL__WPP_CLAMP_EN__SHIFT; - m->cp_hqd_pq_doorbell_control |= 1 << - CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; } if (priv_cp_queues) m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + if (mm->dev->cwsr_enabled) m->cp_hqd_ctx_save_control = 0; update_cu_mask(mm, mqd, q); @@ -532,7 +488,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -544,7 +499,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -556,7 +510,6 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; - mqd->check_queue_active = check_sdma_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 246fe6c..eff7580 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -44,45 +44,6 @@ static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) return (struct vi_sdma_mqd *)mqd; } -static bool check_sdma_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - struct vi_sdma_mqd *m = get_sdma_mqd(q->mqd); - - rptr = m->sdmax_rlcx_rb_rptr; - wptr = m->sdmax_rlcx_rb_wptr; - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - - return (rptr != wptr); -} - -static bool check_queue_active(struct queue *q) -{ - uint32_t rptr, wptr; - uint32_t cntl_stack_offset, cntl_stack_size; - struct vi_mqd *m = get_mqd(q->mqd); - - rptr = m->cp_hqd_pq_rptr; - wptr = m->cp_hqd_pq_wptr; - cntl_stack_offset = m->cp_hqd_cntl_stack_offset; - cntl_stack_size = m->cp_hqd_cntl_stack_size; - - pr_debug("rptr=%d, wptr=%d\n", rptr, wptr); - pr_debug("m->cp_hqd_cntl_stack_offset=0x%08x\n", cntl_stack_offset); - pr_debug("m->cp_hqd_cntl_stack_size=0x%08x\n", cntl_stack_size); - - if ((rptr == 0 && wptr == 0) || - cntl_stack_offset == 0xffffffff || - cntl_stack_size > 0x5000) - return false; - - /* Process is idle if both conditions are meet: - * queue's rptr equals to wptr - * control stack is empty, cntl_stack_offset = cntl_stack_size - */ - return (rptr != wptr || cntl_stack_offset != cntl_stack_size); -} - static void update_cu_mask(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { @@ -198,7 +159,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); } - if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + if (mm->dev->cwsr_enabled) { m->cp_hqd_persistent_state |= (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); m->cp_hqd_ctx_save_base_addr_lo = @@ -293,7 +254,7 @@ 
static int __update_mqd(struct mqd_manager *mm, void *mqd, if (priv_cp_queues) m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT; - if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + if (mm->dev->cwsr_enabled) m->cp_hqd_ctx_save_control = atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; @@ -537,7 +498,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -549,7 +509,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; - mqd->check_queue_active = check_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif @@ -561,7 +520,6 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; - mqd->check_queue_active = check_sdma_queue_active; #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif @@ -586,3 +544,4 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_tonga; return mqd; } + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index c6080ed3..98c89d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -26,6 +26,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_kernel_queue.h" #include "kfd_priv.h" +#include "kfd_pm4_opcodes.h" static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes, unsigned int buffer_size_bytes) @@ -44,7 +45,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int process_count, queue_count, compute_queue_count; unsigned int map_queue_size; unsigned int max_proc_per_quantum = 1; - struct kfd_dev *dev = pm->dqm->dev; + + struct kfd_dev *dev = pm->dqm->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; @@ -55,20 +57,21 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * hws_max_conc_proc has been done in * kgd2kfd_device_init(). 
*/ + *over_subscription = false; if (dev->max_proc_per_quantum > 1) max_proc_per_quantum = dev->max_proc_per_quantum; if ((process_count > max_proc_per_quantum) || - compute_queue_count > get_queues_num(pm->dqm)) { + compute_queue_count > get_queues_num(pm->dqm)) { *over_subscription = true; pr_debug("Over subscribed runlist\n"); } - map_queue_size = pm->pmf->map_queues_size; + map_queue_size = pm->pmf->get_map_queues_packet_size(); /* calculate run list ib allocation size */ - *rlib_size = process_count * pm->pmf->map_process_size + + *rlib_size = process_count * pm->pmf->get_map_process_packet_size() + queue_count * map_queue_size; /* @@ -76,7 +79,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, * when over subscription */ if (*over_subscription) - *rlib_size += pm->pmf->runlist_size; + *rlib_size += pm->pmf->get_runlist_packet_size(); pr_debug("runlist ib size %d\n", *rlib_size); } @@ -157,7 +160,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; proccesses_mapped++; - inc_wptr(&rl_wptr, pm->pmf->map_process_size, + inc_wptr(&rl_wptr, pm->pmf->get_map_process_packet_size(), alloc_size_bytes); list_for_each_entry(kq, &qpd->priv_queue_list, list) { @@ -175,7 +178,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; inc_wptr(&rl_wptr, - pm->pmf->map_queues_size, + pm->pmf->get_map_queues_packet_size(), alloc_size_bytes); } @@ -190,12 +193,11 @@ static int pm_create_runlist_ib(struct packet_manager *pm, &rl_buffer[rl_wptr], q, qpd->is_debug); - if (retval) return retval; inc_wptr(&rl_wptr, - pm->pmf->map_queues_size, + pm->pmf->get_map_queues_packet_size(), alloc_size_bytes); } } @@ -215,38 +217,37 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; } -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver) { - switch (dqm->dev->device_info->asic_family) { + pm->dqm = dqm; + mutex_init(&pm->lock); + pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); + if (!pm->priv_queue) { + mutex_destroy(&pm->lock); + return -ENOMEM; + } + pm->allocated = false; + + switch (pm->dqm->dev->device_info->asic_family) { case CHIP_KAVERI: case CHIP_HAWAII: - /* PM4 packet structures on CIK are the same as on VI */ + kfd_pm_func_init_cik(pm, fw_ver); + break; case CHIP_CARRIZO: case CHIP_TONGA: case CHIP_FIJI: case CHIP_POLARIS10: case CHIP_POLARIS11: - pm->pmf = &kfd_vi_pm_funcs; + kfd_pm_func_init_vi(pm, fw_ver); break; case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: - pm->pmf = &kfd_v9_pm_funcs; + kfd_pm_func_init_v9(pm, fw_ver); break; default: - WARN(1, "Unexpected ASIC family %u", - dqm->dev->device_info->asic_family); - return -EINVAL; - } - - pm->dqm = dqm; - mutex_init(&pm->lock); - pm->priv_queue = kernel_queue_init(dqm->dev, KFD_QUEUE_TYPE_HIQ); - if (!pm->priv_queue) { - mutex_destroy(&pm->lock); - return -ENOMEM; + BUG(); } - pm->allocated = false; return 0; } @@ -263,7 +264,7 @@ int pm_send_set_resources(struct packet_manager *pm, uint32_t *buffer, size; int retval = 0; - size = pm->pmf->set_resources_size; + size = pm->pmf->get_set_resources_packet_size(); mutex_lock(&pm->lock); pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), @@ -300,7 +301,8 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) pr_debug("runlist IB address: 0x%llX\n", rl_gpu_ib_addr); - packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t); + 
packet_size_dwords = pm->pmf->get_runlist_packet_size() / + sizeof(uint32_t); mutex_lock(&pm->lock); retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, @@ -309,7 +311,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) goto fail_acquire_packet_buffer; retval = pm->pmf->runlist(pm, rl_buffer, rl_gpu_ib_addr, - rl_ib_size / sizeof(uint32_t), false); + rl_ib_size / sizeof(uint32_t), false); if (retval) goto fail_create_runlist; @@ -337,7 +339,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, if (WARN_ON(!fence_address)) return -EFAULT; - size = pm->pmf->query_status_size; + size = pm->pmf->get_query_status_packet_size(); mutex_lock(&pm->lock); pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); @@ -366,7 +368,7 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, uint32_t *buffer, size; int retval = 0; - size = pm->pmf->unmap_queues_size; + size = pm->pmf->get_unmap_queues_packet_size(); mutex_lock(&pm->lock); pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, size / sizeof(uint32_t), (unsigned int **)&buffer); @@ -398,51 +400,17 @@ void pm_release_ib(struct packet_manager *pm) mutex_unlock(&pm->lock); } -#if defined(CONFIG_DEBUG_FS) - int pm_debugfs_runlist(struct seq_file *m, void *data) { struct packet_manager *pm = data; - mutex_lock(&pm->lock); - if (!pm->allocated) { seq_puts(m, " No active runlist\n"); - goto out; + return 0; } seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); -out: - mutex_unlock(&pm->lock); return 0; } - -int pm_debugfs_hang_hws(struct packet_manager *pm) -{ - uint32_t *buffer, size; - int r = 0; - - size = pm->pmf->query_status_size; - mutex_lock(&pm->lock); - pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue, - size / sizeof(uint32_t), (unsigned int **)&buffer); - if (!buffer) { - pr_err("Failed to allocate buffer on kernel queue\n"); - r = -ENOMEM; - goto out; - } - memset(buffer, 0x55, size); - pm->priv_queue->ops.submit_packet(pm->priv_queue); - - pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.", - buffer[0], buffer[1], buffer[2], buffer[3], - buffer[4], buffer[5], buffer[6]); -out: - mutex_unlock(&pm->lock); - return r; -} - - -#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c index 87344cc..fae8e8c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c @@ -49,9 +49,9 @@ #include #include #include -#include #include "kfd_priv.h" +#include "amd_rdma.h" @@ -137,6 +137,7 @@ static void (*pfn_ib_unregister_peer_memory_client)(void *reg_handle); static const struct amd_rdma_interface *rdma_interface; +static invalidate_peer_memory ib_invalidate_callback; static void *ib_reg_handle; struct amd_mem_context { @@ -168,6 +169,9 @@ static void free_callback(void *client_priv) pr_debug("mem_context->core_context 0x%p\n", mem_context->core_context); + /* Call back IB stack asking to invalidate memory */ + (*ib_invalidate_callback) (ib_reg_handle, mem_context->core_context); + /* amdkfd will free resources when we return from this callback. * Set flag to inform that there is nothing to do on "put_pages", etc. 
*/ @@ -474,7 +478,7 @@ void kfd_init_peer_direct(void) strcpy(amd_mem_client.version, AMD_PEER_BRIDGE_DRIVER_VERSION); ib_reg_handle = pfn_ib_register_peer_memory_client(&amd_mem_client, - NULL); + &ib_invalidate_callback); if (!ib_reg_handle) { pr_err("Cannot register peer memory client\n"); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 7869a9d..b2ef0f5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -30,15 +30,16 @@ #include #include #include -#include #include +#include +#include +#include #include #include #include -#include -#include #include +#include "amd_rdma.h" #include "amd_shared.h" #define KFD_SYSFS_FILE_MODE 0444 @@ -49,7 +50,8 @@ /* Use upper bits of mmap offset to store KFD driver specific information. * BITS[63:62] - Encode MMAP type * BITS[61:46] - Encode gpu_id. To identify to which GPU the offset belongs to - * BITS[45:0] - MMAP offset value + * BITS[45:40] - Reserved. Not Used. + * BITS[39:0] - MMAP offset value. Used by TTM. * * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these * defines are w.r.t to PAGE_SIZE @@ -68,7 +70,7 @@ #define KFD_MMAP_GPU_ID_GET(offset) ((offset & KFD_MMAP_GPU_ID_MASK) \ >> KFD_MMAP_GPU_ID_SHIFT) -#define KFD_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFULL >> PAGE_SHIFT) +#define KFD_MMAP_OFFSET_VALUE_MASK (0xFFFFFFFFFFULL >> PAGE_SHIFT) #define KFD_MMAP_OFFSET_VALUE_GET(offset) (offset & KFD_MMAP_OFFSET_VALUE_MASK) /* @@ -81,6 +83,7 @@ #define KFD_CIK_HIQ_PIPE 4 #define KFD_CIK_HIQ_QUEUE 0 + /* Macro for allocating structures */ #define kfd_alloc_struct(ptr_to_struct) \ ((typeof(ptr_to_struct)) kzalloc(sizeof(*ptr_to_struct), GFP_KERNEL)) @@ -113,14 +116,14 @@ extern int max_num_of_queues_per_device; /* Kernel module parameter to specify the scheduling policy */ extern int sched_policy; +extern int cwsr_enable; + /* * Kernel module parameter to specify the maximum process * number per HW scheduler */ extern int hws_max_conc_proc; -extern int cwsr_enable; - /* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception @@ -142,18 +145,13 @@ extern int ignore_crat; /* * Set sh_mem_config.retry_disable on Vega10 */ -extern int noretry; +extern int vega10_noretry; /* * Enable privileged mode for all CP queues including user queues */ extern int priv_cp_queues; -/* - * Halt if HWS hang is detected - */ -extern int halt_if_hws_hang; - /** * enum kfd_sched_policy * @@ -210,7 +208,6 @@ struct kfd_device_info { bool needs_pci_atomics; /* obtain from adev->sdma.num_instances */ unsigned int num_sdma_engines; - unsigned int num_sdma_queues_per_engine; }; struct kfd_mem_obj { @@ -294,8 +291,6 @@ struct kfd_dev { bool cwsr_enabled; const void *cwsr_isa; unsigned int cwsr_isa_size; - - bool pci_atomic_requested; }; struct kfd_ipc_obj; @@ -306,41 +301,6 @@ struct kfd_bo { struct kfd_dev *dev; struct list_head cb_data_head; struct kfd_ipc_obj *kfd_ipc_obj; - /* page-aligned VA address */ - uint64_t cpuva; - unsigned int mem_type; -}; - -struct cma_system_bo { - struct kgd_mem *mem; - struct sg_table *sg; - struct kfd_dev *dev; - struct list_head list; -}; - -/* Similar to iov_iter */ -struct cma_iter { - /* points to current entry of range array */ - struct kfd_memory_range *array; - /* total number of entries in the initial array */ - unsigned long nr_segs; - /* total amount of data pointed by kfd array*/ - unsigned long total; - /* offset into the entry pointed by cma_iter.array */ - unsigned long 
offset; - struct kfd_process *p; - struct mm_struct *mm; - struct task_struct *task; - /* current kfd_bo associated with cma_iter.array.va_addr */ - struct kfd_bo *cur_bo; - /* offset w.r.t cur_bo */ - unsigned long bo_offset; - /* If cur_bo is a userptr BO, then a shadow system BO is created - * using its underlying pages. cma_bo holds this BO. cma_list is a - * list cma_bos created in one session - */ - struct cma_system_bo *cma_bo; - struct list_head cma_list; }; /* KGD2KFD callbacks */ @@ -444,11 +404,7 @@ enum KFD_QUEUE_PRIORITY { * @is_interop: Defines if this is a interop queue. Interop queue means that * the queue can access both graphics and compute resources. * - * @is_evicted: Defines if the queue is evicted. Only active queues - * are evicted, rendering them inactive. - * - * @is_active: Defines if the queue is active or not. @is_active and - * @is_evicted are protected by the DQM lock. + * @is_active: Defines if the queue is active or not. * * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid * of the queue. @@ -470,7 +426,7 @@ struct queue_properties { void __iomem *doorbell_ptr; uint32_t doorbell_off; bool is_interop; - bool is_evicted; + bool is_evicted; /* true -> queue is evicted */ bool is_active; /* Not relevant for user mode queues in cp scheduling */ unsigned int vmid; @@ -589,6 +545,7 @@ struct qcm_process_device { struct list_head priv_queue_list; unsigned int queue_count; + /* a data field only meaningful for non-HWS case */ unsigned int vmid; bool is_debug; unsigned int evicted; /* eviction counter, 0=active */ @@ -602,11 +559,11 @@ struct qcm_process_device { * All the memory management data should be here too */ uint64_t gds_context_area; - uint64_t page_table_base; uint32_t sh_mem_config; uint32_t sh_mem_bases; uint32_t sh_mem_ape1_base; uint32_t sh_mem_ape1_limit; + uint32_t page_table_base; uint32_t gds_size; uint32_t num_gws; uint32_t num_oac; @@ -619,11 +576,11 @@ struct qcm_process_device { uint64_t tma_addr; /* IB memory */ - uint64_t ib_base; + uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */ void *ib_kaddr; /*doorbell resources per process per device*/ - unsigned long *doorbell_bitmap; + unsigned long *doorbell_bitmap; }; /* KFD Memory Eviction */ @@ -635,10 +592,11 @@ struct qcm_process_device { /* Approx. time before evicting the process again */ #define PROCESS_ACTIVE_TIME_MS 10 -int kgd2kfd_quiesce_mm(struct mm_struct *mm); -int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); +int kfd_process_evict_queues(struct kfd_process *p); +int kfd_process_restore_queues(struct kfd_process *p); + /* 8 byte handle containing GPU ID in the most significant 4 bytes and * idr_handle in the least significant 4 bytes @@ -754,14 +712,13 @@ struct kfd_process { struct idr event_idr; /* Event page */ struct kfd_signal_page *signal_page; - size_t signal_mapped_size; size_t signal_event_count; bool signal_event_limit_reached; struct rb_root_cached bo_interval_tree; /* Information used for memory eviction */ - void *kgd_process_info; + void *process_info; /* Eviction fence that is attached to all the BOs of this process. 
The * fence will be triggered during eviction and new one will be created * during restore @@ -804,32 +761,29 @@ struct amdkfd_ioctl_desc { int kfd_process_create_wq(void); void kfd_process_destroy_wq(void); struct kfd_process *kfd_create_process(struct file *filep); -struct kfd_process *kfd_get_process(const struct task_struct *); +struct kfd_process *kfd_get_process(const struct task_struct *task); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm); void kfd_unref_process(struct kfd_process *p); -int kfd_process_evict_queues(struct kfd_process *p); -int kfd_process_restore_queues(struct kfd_process *p); void kfd_suspend_all_processes(void); int kfd_resume_all_processes(void); int kfd_process_device_init_vm(struct kfd_process_device *pdd, struct file *drm_file); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, - struct kfd_process *p); + struct kfd_process *p); struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p); struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); -int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, - struct vm_area_struct *vma); +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma); /* KFD process API for creating and translating handles */ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, void *mem, uint64_t start, - uint64_t length, uint64_t cpuva, - unsigned int mem_type, + uint64_t length, struct kfd_ipc_obj *ipc_obj); void *kfd_process_device_translate_handle(struct kfd_process_device *p, int handle); @@ -864,7 +818,7 @@ void kfd_pasid_free(unsigned int pasid); size_t kfd_doorbell_process_slice(struct kfd_dev *kfd); int kfd_doorbell_init(struct kfd_dev *kfd); void kfd_doorbell_fini(struct kfd_dev *kfd); -int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, +int kfd_doorbell_mmap(struct kfd_dev *kfd, struct kfd_process *process, struct vm_area_struct *vma); void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, unsigned int *doorbell_off); @@ -921,6 +875,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd); /* amdkfd Apertures */ int kfd_init_apertures(struct kfd_process *process); +int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd, + uint64_t base, uint64_t limit); /* Queue Context Management */ int init_queue(struct queue **q, const struct queue_properties *properties); @@ -975,6 +931,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm, void __user *ctl_stack, u32 *ctl_stack_used_size, u32 *save_area_used_size); +int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm); +int kgd2kfd_resume_mm(struct kfd_dev *kfd, struct mm_struct *mm); int amdkfd_fence_wait_timeout(unsigned int *fence_addr, unsigned int fence_value, @@ -985,6 +943,8 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, #define KFD_FENCE_COMPLETED (100) #define KFD_FENCE_INIT (10) +struct packet_manager_func; + struct packet_manager { struct device_queue_manager *dqm; struct kernel_queue *priv_queue; @@ -993,11 +953,11 @@ struct packet_manager { struct kfd_mem_obj *ib_buffer_obj; unsigned int ib_size_bytes; - const struct packet_manager_funcs *pmf; + struct packet_manager_funcs *pmf; }; struct packet_manager_funcs { - /* Support ASIC-specific packet formats for PM4 packets */ + /* Support different firmware versions for PM4 packets */ int 
(*map_process)(struct packet_manager *pm, uint32_t *buffer, struct qcm_process_device *qpd); int (*runlist)(struct packet_manager *pm, uint32_t *buffer, @@ -1013,22 +973,20 @@ struct packet_manager_funcs { unsigned int sdma_engine); int (*query_status)(struct packet_manager *pm, uint32_t *buffer, uint64_t fence_address, uint32_t fence_value); - int (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); - - /* Packet sizes */ - int map_process_size; - int runlist_size; - int set_resources_size; - int map_queues_size; - int unmap_queues_size; - int query_status_size; - int release_mem_size; -}; + uint32_t (*release_mem)(uint64_t gpu_addr, uint32_t *buffer); -extern const struct packet_manager_funcs kfd_vi_pm_funcs; -extern const struct packet_manager_funcs kfd_v9_pm_funcs; + uint32_t (*get_map_process_packet_size)(void); + uint32_t (*get_runlist_packet_size)(void); + uint32_t (*get_set_resources_packet_size)(void); + uint32_t (*get_map_queues_packet_size)(void); + uint32_t (*get_unmap_queues_packet_size)(void); + uint32_t (*get_query_status_packet_size)(void); + uint32_t (*get_release_mem_packet_size)(void); -int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); +}; + +int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm, + uint16_t fw_ver); void pm_uninit(struct packet_manager *pm); int pm_send_set_resources(struct packet_manager *pm, struct scheduling_resources *res); @@ -1043,10 +1001,37 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, void pm_release_ib(struct packet_manager *pm); -/* Following PM funcs can be shared among VI and AI */ +/* Following PM funcs can be shared among CIK and VI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); +int pm_runlist_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain); +int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static); int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, struct scheduling_resources *res); +int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine); +int pm_query_status_vi(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value); +uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer); + +uint32_t pm_get_map_process_packet_size_vi(void); +uint32_t pm_get_runlist_packet_size_vi(void); +uint32_t pm_get_set_resources_packet_size_vi(void); +uint32_t pm_get_map_queues_packet_size_vi(void); +uint32_t pm_get_unmap_queues_packet_size_vi(void); +uint32_t pm_get_query_status_packet_size_vi(void); +uint32_t pm_get_release_mem_packet_size_vi(void); + + +void kfd_pm_func_init_vi(struct packet_manager *pm, uint16_t fw_ver); +void kfd_pm_func_init_cik(struct packet_manager *pm, uint16_t fw_ver); + +void kfd_pm_func_init_v9(struct packet_manager *pm, uint16_t fw_ver); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); @@ -1071,24 +1056,21 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, void kfd_signal_hw_exception_event(unsigned int pasid); int kfd_set_event(struct kfd_process *p, uint32_t event_id); int kfd_reset_event(struct kfd_process *p, uint32_t event_id); -int kfd_event_page_set(struct kfd_process *p, void *kernel_address, - uint64_t size); int kfd_event_create(struct file *devkfd, struct kfd_process *p, uint32_t event_type, bool auto_reset, 
uint32_t node_id, uint32_t *event_id, uint32_t *event_trigger_data, - uint64_t *event_page_offset, uint32_t *event_slot_index); + uint64_t *event_page_offset, uint32_t *event_slot_index, + void *kern_addr); int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, struct kfd_vm_fault_info *info); -void kfd_signal_reset_event(struct kfd_dev *dev); - void kfd_flush_tlb(struct kfd_process_device *pdd); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); -bool kfd_is_locked(void); +#define KFD_SCRATCH_KV_FW_VER 413 /* PeerDirect support */ void kfd_init_peer_direct(void); @@ -1109,10 +1091,6 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data); int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); int pm_debugfs_runlist(struct seq_file *m, void *data); -int kfd_debugfs_hang_hws(struct kfd_dev *dev); -int pm_debugfs_hang_hws(struct packet_manager *pm); -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm); - #else static inline void kfd_debugfs_init(void) {} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index da67302..c627b63 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include "kfd_ipc.h" @@ -61,6 +60,9 @@ static struct workqueue_struct *kfd_process_wq; */ static struct workqueue_struct *kfd_restore_wq; +#define MIN_IDR_ID 1 +#define MAX_IDR_ID 0 /*0 - for unlimited*/ + static struct kfd_process *find_process(const struct task_struct *thread, bool ref); static void kfd_process_ref_release(struct kref *ref); @@ -78,12 +80,7 @@ int kfd_process_create_wq(void) if (!kfd_restore_wq) kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0); - if (!kfd_process_wq || !kfd_restore_wq) { - kfd_process_destroy_wq(); - return -ENOMEM; - } - - return 0; + return kfd_process_wq && kfd_restore_wq ? 0 : -ENOMEM; } void kfd_process_destroy_wq(void) @@ -121,11 +118,9 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, struct kgd_mem *mem = NULL; int handle; int err; - unsigned int mem_type; err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, - pdd->vm, NULL, &mem, NULL, - flags); + pdd->vm, &mem, NULL, flags); if (err) goto err_alloc_mem; @@ -139,18 +134,13 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, goto sync_memory_failed; } - mem_type = flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | - KFD_IOC_ALLOC_MEM_FLAGS_GTT | - KFD_IOC_ALLOC_MEM_FLAGS_USERPTR | - KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL); - /* Create an obj handle so kfd_process_device_remove_obj_handle * will take care of the bo removal when the process finishes. * We do not need to take p->mutex, because the process is just * created and the ioctls have not had the chance to run. */ handle = kfd_process_device_create_obj_handle( - pdd, mem, gpu_va, size, 0, mem_type, NULL); + pdd, mem, gpu_va, size, NULL); if (handle < 0) { err = handle; @@ -185,16 +175,14 @@ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, /* kfd_process_device_reserve_ib_mem - Reserve memory inside the * process for IB usage The memory reserved is for KFD to submit * IB to AMDGPU from kernel. If the memory is reserved - * successfully, ib_kaddr will have the CPU/kernel - * address. Check ib_kaddr before accessing the memory. + * successfully, ib_kaddr_assigned will have the CPU/kernel + * address. 
Check ib_kaddr_assigned before accessing the memory. */ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) { struct qcm_process_device *qpd = &pdd->qpd; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | - ALLOC_MEM_FLAGS_WRITABLE | - ALLOC_MEM_FLAGS_EXECUTABLE; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTE_ACCESS; void *kaddr; int ret; @@ -215,6 +203,7 @@ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; + struct task_struct *thread = current; if (!thread->mm) @@ -255,8 +244,6 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) return ERR_PTR(-EINVAL); process = find_process(thread, false); - if (!process) - return ERR_PTR(-EINVAL); return process; } @@ -352,9 +339,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) list_for_each_entry_safe(pdd, temp, &p->per_device_data, per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", - pdd->dev->id, p->pasid); - + /* Destroy the GPUVM VM context */ if (pdd->drm_file) fput(pdd->drm_file); else if (pdd->vm) @@ -407,6 +392,9 @@ static void kfd_process_ref_release(struct kref *ref) { struct kfd_process *p = container_of(ref, struct kfd_process, ref); + if (WARN_ON(!kfd_process_wq)) + return; + INIT_WORK(&p->release_work, kfd_process_wq_release); queue_work(kfd_process_wq, &p->release_work); } @@ -487,19 +475,17 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) if (!dev->cwsr_enabled || qpd->cwsr_kaddr || qpd->cwsr_base) continue; - offset = (KFD_MMAP_TYPE_RESERVED_MEM | KFD_MMAP_GPU_ID(dev->id)) - << PAGE_SHIFT; - qpd->tba_addr = (int64_t)vm_mmap(filep, 0, - KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, - MAP_SHARED, offset); + offset = (dev->id | KFD_MMAP_TYPE_RESERVED_MEM) << PAGE_SHIFT; + qpd->tba_addr = (uint64_t)vm_mmap(filep, 0, + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); if (IS_ERR_VALUE(qpd->tba_addr)) { - int err = qpd->tba_addr; - - pr_err("Failure to set tba address. error %d.\n", err); + pr_err("Failure to set tba address. 
error -%d.\n", + (int)qpd->tba_addr); qpd->tba_addr = 0; qpd->cwsr_kaddr = NULL; - return err; + return -ENOMEM; } memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); @@ -516,8 +502,9 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) { struct kfd_dev *dev = pdd->dev; struct qcm_process_device *qpd = &pdd->qpd; - uint32_t flags = ALLOC_MEM_FLAGS_GTT | - ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_EXECUTABLE; + uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED | + ALLOC_MEM_FLAGS_NO_SUBSTITUTE | ALLOC_MEM_FLAGS_READONLY | + ALLOC_MEM_FLAGS_EXECUTE_ACCESS; void *kaddr; int ret; @@ -675,12 +662,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, if (!pdd) return NULL; - if (init_doorbell_bitmap(&pdd->qpd, dev)) { - pr_err("Failed to init doorbell for process\n"); - kfree(pdd); - return NULL; - } - pdd->dev = dev; INIT_LIST_HEAD(&pdd->qpd.queues_list); INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); @@ -694,8 +675,19 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, /* Init idr used for memory handle translation */ idr_init(&pdd->alloc_idr); + if (init_doorbell_bitmap(&pdd->qpd, dev)) { + pr_err("Failed to init doorbell for process\n"); + goto err_create_pdd; + } return pdd; + +err_create_pdd: + kfree(pdd->qpd.doorbell_bitmap); + idr_destroy(&pdd->alloc_idr); + list_del(&pdd->per_device_list); + kfree(pdd); + return NULL; } /** @@ -720,18 +712,17 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, int ret; if (pdd->vm) - return drm_file ? -EBUSY : 0; + return 0; p = pdd->process; dev = pdd->dev; if (drm_file) ret = dev->kfd2kgd->acquire_process_vm( - dev->kgd, drm_file, p->pasid, - &pdd->vm, &p->kgd_process_info, &p->ef); + dev->kgd, drm_file, &pdd->vm, &p->process_info, &p->ef); else ret = dev->kfd2kgd->create_process_vm( - dev->kgd, p->pasid, &pdd->vm, &p->kgd_process_info, &p->ef); + dev->kgd, &pdd->vm, &p->process_info, &p->ef); if (ret) { pr_err("Failed to create process VM object\n"); return ret; @@ -815,8 +806,7 @@ bool kfd_has_process_device_data(struct kfd_process *p) */ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, void *mem, uint64_t start, - uint64_t length, uint64_t cpuva, - unsigned int mem_type, + uint64_t length, struct kfd_ipc_obj *ipc_obj) { int handle; @@ -837,12 +827,15 @@ int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd, buf_obj->mem = mem; buf_obj->dev = pdd->dev; buf_obj->kfd_ipc_obj = ipc_obj; - buf_obj->cpuva = cpuva; - buf_obj->mem_type = mem_type; INIT_LIST_HEAD(&buf_obj->cb_data_head); - handle = idr_alloc(&pdd->alloc_idr, buf_obj, 0, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + + handle = idr_alloc(&pdd->alloc_idr, buf_obj, MIN_IDR_ID, MAX_IDR_ID, + GFP_NOWAIT); + + idr_preload_end(); if (handle < 0) kfree(buf_obj); @@ -945,6 +938,42 @@ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) return ret_p; } +void kfd_suspend_all_processes(void) +{ + struct kfd_process *p; + unsigned int temp; + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + cancel_delayed_work_sync(&p->eviction_work); + cancel_delayed_work_sync(&p->restore_work); + + if (kfd_process_evict_queues(p)) + pr_err("Failed to suspend process %d\n", p->pasid); + dma_fence_signal(p->ef); + dma_fence_put(p->ef); + p->ef = NULL; + } + srcu_read_unlock(&kfd_processes_srcu, idx); +} + +int kfd_resume_all_processes(void) +{ + struct kfd_process *p; + 
unsigned int temp; + int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { + pr_err("Restore process %d failed during resume\n", + p->pasid); + ret = -EFAULT; + } + } + srcu_read_unlock(&kfd_processes_srcu, idx); + return ret; +} + /* This increments the process->ref counter. */ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) { @@ -1036,14 +1065,15 @@ static void evict_process_worker(struct work_struct *work) "Eviction fence mismatch\n"); /* Narrow window of overlap between restore and evict work - * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos - * unreserves KFD BOs, it is possible to evicted again. But - * restore has few more steps of finish. So lets wait for any - * previous restore work to complete + * item is possible. Once + * amdgpu_amdkfd_gpuvm_restore_process_bos unreserves KFD BOs, + * it is possible to evicted again. But restore has few more + * steps of finish. So lets wait for any previous restore work + * to complete */ flush_delayed_work(&p->restore_work); - pr_info("Started evicting pasid %d\n", p->pasid); + pr_info("Started evicting process of pasid %d\n", p->pasid); ret = kfd_process_evict_queues(p); if (!ret) { dma_fence_signal(p->ef); @@ -1052,9 +1082,10 @@ static void evict_process_worker(struct work_struct *work) queue_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - pr_info("Finished evicting pasid %d\n", p->pasid); + pr_info("Finished evicting process of pasid %d\n", p->pasid); } else - pr_err("Failed to evict queues of pasid %d\n", p->pasid); + pr_err("Failed to quiesce user queues. Cannot evict pasid %d\n", + p->pasid); } static void restore_process_worker(struct work_struct *work) @@ -1080,7 +1111,7 @@ static void restore_process_worker(struct work_struct *work) struct kfd_process_device, per_device_list); - pr_info("Started restoring pasid %d\n", p->pasid); + pr_info("Started restoring process of pasid %d\n", p->pasid); /* Setting last_restore_timestamp before successful restoration. 
* Otherwise this would have to be set by KGD (restore_process_bos) @@ -1093,11 +1124,10 @@ static void restore_process_worker(struct work_struct *work) */ p->last_restore_timestamp = get_jiffies_64(); - ret = pdd->dev->kfd2kgd->restore_process_bos(p->kgd_process_info, - &p->ef); + ret = pdd->dev->kfd2kgd->restore_process_bos(p->process_info, &p->ef); if (ret) { - pr_info("Failed to restore BOs of pasid %d, retry after %d ms\n", - p->pasid, PROCESS_BACK_OFF_TIME_MS); + pr_info("Restore failed, try again after %d ms\n", + PROCESS_BACK_OFF_TIME_MS); ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); WARN(!ret, "reschedule restore work failed\n"); @@ -1105,54 +1135,21 @@ static void restore_process_worker(struct work_struct *work) } ret = kfd_process_restore_queues(p); - if (!ret) - pr_info("Finished restoring pasid %d\n", p->pasid); - else - pr_err("Failed to restore queues of pasid %d\n", p->pasid); -} - -void kfd_suspend_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int idx = srcu_read_lock(&kfd_processes_srcu); - - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - cancel_delayed_work_sync(&p->eviction_work); - cancel_delayed_work_sync(&p->restore_work); - - if (kfd_process_evict_queues(p)) - pr_err("Failed to suspend process %d\n", p->pasid); - dma_fence_signal(p->ef); - dma_fence_put(p->ef); - p->ef = NULL; - } - srcu_read_unlock(&kfd_processes_srcu, idx); -} - -int kfd_resume_all_processes(void) -{ - struct kfd_process *p; - unsigned int temp; - int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu); + if (ret) + pr_err("Failed to resume user queues\n"); - hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) { - pr_err("Restore process %d failed during resume\n", - p->pasid); - ret = -EFAULT; - } - } - srcu_read_unlock(&kfd_processes_srcu, idx); - return ret; + pr_info("Finished restoring process of pasid %d\n", p->pasid); } -int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, - struct vm_area_struct *vma) +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma) { + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); struct kfd_process_device *pdd; struct qcm_process_device *qpd; + if (!dev) + return -EINVAL; if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { pr_err("Incorrect CWSR mapping size.\n"); return -EINVAL; @@ -1178,6 +1175,7 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, struct kfd_process *process, KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); } + void kfd_flush_tlb(struct kfd_process_device *pdd) { struct kfd_dev *dev = pdd->dev; @@ -1212,7 +1210,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) r = pqm_debugfs_mqds(m, &p->pqm); mutex_unlock(&p->mutex); - if (r) + if (r != 0) break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 8933323..52882e0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -188,7 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, case KFD_QUEUE_TYPE_SDMA: if (dev->dqm->sdma_queue_count >= get_num_sdma_queues(dev->dqm)) { - pr_debug("Over-subscription is not allowed for SDMA.\n"); + pr_debug("Over-subscription is not allowed for SDMA\n"); retval = -EPERM; goto err_create_queue; } @@ -206,7 +206,7 @@ int 
pqm_create_queue(struct process_queue_manager *pqm, case KFD_QUEUE_TYPE_COMPUTE: /* check if there is over subscription */ if ((dev->dqm->sched_policy == - KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && + KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) && ((dev->dqm->processes_count >= dev->vm_info.vmid_num_kfd) || (dev->dqm->queue_count >= get_queues_num(dev->dqm)))) { pr_debug("Over-subscription is not allowed in radeon_kfd.sched_policy == 1\n"); @@ -241,8 +241,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, } if (retval != 0) { - pr_err("Pasid %d DQM create queue %d failed. ret %d\n", - pqm->process->pasid, type, retval); + pr_err("DQM create queue failed\n"); goto err_create_queue; } @@ -318,16 +317,13 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; + kfree(pqn->q->properties.cu_mask); + pqn->q->properties.cu_mask = NULL; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { - pr_err("Pasid %d destroy queue %d failed, ret %d\n", - pqm->process->pasid, - pqn->q->properties.queue_id, retval); - if (retval != -ETIME) - goto err_destroy_queue; + pr_debug("Destroy queue failed, returned %d\n", retval); + goto err_destroy_queue; } - kfree(pqn->q->properties.cu_mask); - pqn->q->properties.cu_mask = NULL; uninit_queue(pqn->q); } @@ -439,7 +435,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) struct process_queue_node *pqn; struct queue *q; enum KFD_MQD_TYPE mqd_type; - struct mqd_manager *mqd_mgr; + struct mqd_manager *mqd_manager; int r = 0; list_for_each_entry(pqn, &pqm->queues, process_queue_list) { @@ -462,11 +458,11 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q->properties.type, q->device->id); continue; } - mqd_mgr = q->device->dqm->ops.get_mqd_manager( + mqd_manager = q->device->dqm->ops.get_mqd_manager( q->device->dqm, mqd_type); } else if (pqn->kq) { q = pqn->kq->queue; - mqd_mgr = pqn->kq->mqd_mgr; + mqd_manager = pqn->kq->mqd; switch (q->properties.type) { case KFD_QUEUE_TYPE_DIQ: seq_printf(m, " DIQ on device %x\n", @@ -486,7 +482,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) continue; } - r = mqd_mgr->debugfs_show_mqd(m, q->mqd); + r = mqd_manager->debugfs_show_mqd(m, q->mqd); if (r != 0) break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index 6dcd621..a5315d4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -36,8 +36,8 @@ void print_queue_properties(struct queue_properties *q) pr_debug("Queue Address: 0x%llX\n", q->queue_address); pr_debug("Queue Id: %u\n", q->queue_id); pr_debug("Queue Process Vmid: %u\n", q->vmid); - pr_debug("Queue Read Pointer: 0x%px\n", q->read_ptr); - pr_debug("Queue Write Pointer: 0x%px\n", q->write_ptr); + pr_debug("Queue Read Pointer: 0x%p\n", q->read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->write_ptr); pr_debug("Queue Doorbell Pointer: 0x%p\n", q->doorbell_ptr); pr_debug("Queue Doorbell Offset: %u\n", q->doorbell_off); } @@ -53,8 +53,8 @@ void print_queue(struct queue *q) pr_debug("Queue Address: 0x%llX\n", q->properties.queue_address); pr_debug("Queue Id: %u\n", q->properties.queue_id); pr_debug("Queue Process Vmid: %u\n", q->properties.vmid); - pr_debug("Queue Read Pointer: 0x%px\n", q->properties.read_ptr); - pr_debug("Queue Write Pointer: 0x%px\n", q->properties.write_ptr); + pr_debug("Queue Read Pointer: 0x%p\n", q->properties.read_ptr); + pr_debug("Queue Write Pointer: 0x%p\n", q->properties.write_ptr); 
pr_debug("Queue Doorbell Pointer: 0x%p\n", q->properties.doorbell_ptr); pr_debug("Queue Doorbell Offset: %u\n", q->properties.doorbell_off); pr_debug("Queue MQD Address: 0x%p\n", q->mqd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c index 3454514..985855f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_rdma.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include "amd_rdma.h" #include "kfd_priv.h" diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 7702156..320c8d3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -196,7 +196,6 @@ struct kfd_topology_device *kfd_create_topology_device( return dev; } - #define sysfs_show_gen_prop(buffer, fmt, ...) \ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) #define sysfs_show_32bit_prop(buffer, name, value) \ @@ -740,7 +739,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, } /* All hardware blocks have the same number of attributes. */ - num_attrs = ARRAY_SIZE(perf_attr_iommu); + num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); list_for_each_entry(perf, &dev->perf_props, list) { perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) * num_attrs + sizeof(struct attribute_group), @@ -891,8 +890,7 @@ static void kfd_debug_print_topology(void) up_read(&topology_lock); } -/* Helper function for intializing platform_xx members of - * kfd_system_properties. Uses OEM info from the last CPU/APU node. +/* Helper function for intializing platform_xx members of kfd_system_properties */ static void kfd_update_system_properties(void) { @@ -1015,12 +1013,13 @@ int kfd_topology_init(void) */ #ifdef CONFIG_ACPI ret = kfd_create_crat_image_acpi(&crat_image, &image_size); - if (!ret) { + if (ret == 0) { ret = kfd_parse_crat_table(crat_image, &temp_topology_device_list, proximity_domain); if (ret || - kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_release_topology_device_list( &temp_topology_device_list); kfd_destroy_crat_image(crat_image); @@ -1030,8 +1029,8 @@ int kfd_topology_init(void) #endif if (!crat_image) { ret = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_CPU, NULL, - proximity_domain); + COMPUTE_UNIT_CPU, NULL, + proximity_domain); cpu_only_node = 1; if (ret) { pr_err("Error creating VCRAT table for CPU\n"); @@ -1039,8 +1038,8 @@ int kfd_topology_init(void) } ret = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); + &temp_topology_device_list, + proximity_domain); if (ret) { pr_err("Error parsing VCRAT table for CPU\n"); goto err; @@ -1052,12 +1051,12 @@ int kfd_topology_init(void) down_write(&topology_lock); kfd_topology_update_device_list(&temp_topology_device_list, - &topology_device_list); + &topology_device_list); atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); ret = kfd_topology_update_sysfs(); up_write(&topology_lock); - if (!ret) { + if (ret == 0) { sys_props.generation_count++; kfd_update_system_properties(); kfd_debug_print_topology(); @@ -1145,6 +1144,7 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) break; } up_write(&topology_lock); + return out_dev; } @@ -1182,40 +1182,17 @@ static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device 
*dev) { - struct kfd_iolink_properties *link, *cpu_link; - struct kfd_topology_device *cpu_dev; - uint32_t cap; - uint32_t cpu_flag = CRAT_IOLINK_FLAGS_ENABLED; - uint32_t flag = CRAT_IOLINK_FLAGS_ENABLED; + struct kfd_iolink_properties *link; if (!dev || !dev->gpu) return; - pcie_capability_read_dword(dev->gpu->pdev, - PCI_EXP_DEVCAP2, &cap); - - if (!(cap & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64))) - cpu_flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | - CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; - - if (!dev->gpu->pci_atomic_requested || - dev->gpu->device_info->asic_family == CHIP_HAWAII) - flag |= CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | + /* GPU only creates direck links so apply flags setting to all */ + if (dev->gpu->device_info->asic_family == CHIP_HAWAII) + list_for_each_entry(link, &dev->io_link_props, list) + link->flags = CRAT_IOLINK_FLAGS_ENABLED | + CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; - - /* GPU only creates direct links so apply flags setting to all */ - list_for_each_entry(link, &dev->io_link_props, list) { - link->flags = flag; - cpu_dev = kfd_topology_device_by_proximity_domain( - link->node_to); - if (cpu_dev) { - list_for_each_entry(cpu_link, - &cpu_dev->io_link_props, list) - if (cpu_link->node_to == link->node_from) - cpu_link->flags = cpu_flag; - } - } } int kfd_topology_add_device(struct kfd_dev *gpu) @@ -1235,7 +1212,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu) pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); + proximity_domain = atomic_inc_return(& + topology_crat_proximity_domain); /* Check to see if this gpu device exists in the topology_device_list. * If so, assign the gpu to that device, @@ -1246,16 +1224,15 @@ int kfd_topology_add_device(struct kfd_dev *gpu) dev = kfd_assign_gpu(gpu); if (!dev) { res = kfd_create_crat_image_virtual(&crat_image, &image_size, - COMPUTE_UNIT_GPU, gpu, - proximity_domain); + COMPUTE_UNIT_GPU, + gpu, proximity_domain); if (res) { pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", gpu_id); return res; } res = kfd_parse_crat_table(crat_image, - &temp_topology_device_list, - proximity_domain); + &temp_topology_device_list, proximity_domain); if (res) { pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", gpu_id); @@ -1272,13 +1249,14 @@ int kfd_topology_add_device(struct kfd_dev *gpu) res = kfd_topology_update_sysfs(); up_write(&topology_lock); - if (!res) + if (res == 0) sys_props.generation_count++; else pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", gpu_id, res); dev = kfd_assign_gpu(gpu); - if (WARN_ON(!dev)) { + if (!dev) { + pr_err("Could not assign GPU\n"); res = -ENODEV; goto err; } @@ -1331,22 +1309,20 @@ int kfd_topology_add_device(struct kfd_dev *gpu) HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); break; case CHIP_VEGA10: - case CHIP_VEGA20: case CHIP_RAVEN: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); break; default: - WARN(1, "Unexpected ASIC family %u", - dev->gpu->device_info->asic_family); + BUG(); } /* Fix errors in CZ CRAT. 
- * simd_count: Carrizo CRAT reports wrong simd_count, probably - * because it doesn't consider masked out CUs - * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd - * capability flag: Carrizo CRAT doesn't report IOMMU flags + * simd_count: Carrizo CRAT reports wrong simd_count, probably because + * it doesn't consider masked out CUs + * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd. + * capability flag: Carrizo CRAT doesn't report IOMMU flags. */ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { dev->node_props.simd_count = @@ -1386,7 +1362,7 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) up_write(&topology_lock); - if (!res) + if (res == 0) kfd_notify_gpu_change(gpu_id, 0); return res; @@ -1427,7 +1403,7 @@ static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) { int first_cpu_of_numa_node; - if (!cpumask || cpumask == cpu_none_mask) + if (!cpumask || (cpumask == cpu_none_mask)) return -1; first_cpu_of_numa_node = cpumask_first(cpumask); if (first_cpu_of_numa_node >= nr_cpu_ids) @@ -1470,7 +1446,7 @@ int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); r = dqm_debugfs_hqds(m, dev->gpu->dqm); - if (r) + if (r != 0) break; } @@ -1495,7 +1471,7 @@ int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); - if (r) + if (r != 0) break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 2b36baf..f4d29c4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -46,6 +46,9 @@ #define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 #define HSA_CAP_DOORBELL_TYPE_1_0 0x1 #define HSA_CAP_DOORBELL_TYPE_2_0 0x2 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 +#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { @@ -166,9 +169,9 @@ struct kfd_topology_device { struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; - uint8_t oem_id[CRAT_OEMID_LENGTH]; - uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; - uint32_t oem_revision; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; }; struct kfd_system_properties { @@ -187,8 +190,4 @@ struct kfd_topology_device *kfd_create_topology_device( struct list_head *device_list); void kfd_release_topology_device_list(struct list_head *device_list); -extern bool amd_iommu_pc_supported(void); -extern u8 amd_iommu_pc_get_max_banks(u16 devid); -extern u8 amd_iommu_pc_get_max_counters(u16 devid); - #endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/amdkfd/soc15_int.h b/drivers/gpu/drm/amd/amdkfd/soc15_int.h index 0bc0b25..e00d03d 100644 --- a/drivers/gpu/drm/amd/amdkfd/soc15_int.h +++ b/drivers/gpu/drm/amd/amdkfd/soc15_int.h @@ -1,5 +1,5 @@ /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -22,8 +22,45 @@ #ifndef HSA_SOC15_INT_H_INCLUDED #define HSA_SOC15_INT_H_INCLUDED +/* + * vega10+ IH clients + */ +enum soc15_ih_client_id { + SOC15_IH_CLIENTID_IH = 0x00, + SOC15_IH_CLIENTID_ACP = 0x01, + SOC15_IH_CLIENTID_ATHUB = 0x02, + SOC15_IH_CLIENTID_BIF = 0x03, + SOC15_IH_CLIENTID_DCE = 0x04, + SOC15_IH_CLIENTID_ISP = 0x05, + SOC15_IH_CLIENTID_PCIE0 = 0x06, + SOC15_IH_CLIENTID_RLC = 0x07, + SOC15_IH_CLIENTID_SDMA0 = 0x08, + SOC15_IH_CLIENTID_SDMA1 = 0x09, + SOC15_IH_CLIENTID_SE0SH = 0x0a, + SOC15_IH_CLIENTID_SE1SH = 0x0b, + SOC15_IH_CLIENTID_SE2SH = 0x0c, + SOC15_IH_CLIENTID_SE3SH = 0x0d, + SOC15_IH_CLIENTID_SYSHUB = 0x0e, + SOC15_IH_CLIENTID_THM = 0x0f, + SOC15_IH_CLIENTID_UVD = 0x10, + SOC15_IH_CLIENTID_VCE0 = 0x11, + SOC15_IH_CLIENTID_VMC = 0x12, + SOC15_IH_CLIENTID_XDMA = 0x13, + SOC15_IH_CLIENTID_GRBM_CP = 0x14, + SOC15_IH_CLIENTID_ATS = 0x15, + SOC15_IH_CLIENTID_ROM_SMUIO = 0x16, + SOC15_IH_CLIENTID_DF = 0x17, + SOC15_IH_CLIENTID_VCE1 = 0x18, + SOC15_IH_CLIENTID_PWR = 0x19, + SOC15_IH_CLIENTID_UTCL2 = 0x1b, + SOC15_IH_CLIENTID_EA = 0x1c, + SOC15_IH_CLIENTID_UTCL2LOG = 0x1d, + SOC15_IH_CLIENTID_MP0 = 0x1e, + SOC15_IH_CLIENTID_MP1 = 0x1f, + + SOC15_IH_CLIENTID_MAX +}; -#include "soc15_ih_clientid.h" #define SOC15_INTSRC_CP_END_OF_PIPE 181 #define SOC15_INTSRC_CP_BAD_OPCODE 183 -- 2.7.4
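
Illustrative note: the client and source IDs reinstated in soc15_int.h above are decoded from the first dword of each 8-dword IH ring entry before the KFD interrupt code decides whether to forward the entry to its worker queue. The sketch below is only an illustration under that assumed layout (client ID in bits 7:0, source ID in bits 15:8); the helper names are hypothetical and not part of this patch or the kernel sources.

    #include <stdint.h>
    #include <stdbool.h>
    #include "soc15_int.h"   /* SOC15_IH_CLIENTID_* and SOC15_INTSRC_* from above */

    /* Hypothetical decode helpers; the dword-0 field layout is an assumption. */
    static inline uint32_t soc15_client_id(const uint32_t *ih_ring_entry)
    {
            return ih_ring_entry[0] & 0xff;
    }

    static inline uint32_t soc15_source_id(const uint32_t *ih_ring_entry)
    {
            return (ih_ring_entry[0] >> 8) & 0xff;
    }

    /* Example check: a CP end-of-pipe interrupt delivered via the GRBM_CP client. */
    static bool is_cp_end_of_pipe(const uint32_t *ih_ring_entry)
    {
            return soc15_client_id(ih_ring_entry) == SOC15_IH_CLIENTID_GRBM_CP &&
                   soc15_source_id(ih_ring_entry) == SOC15_INTSRC_CP_END_OF_PIPE;
    }

A check of this shape is roughly what the GFX9 interrupt ISR performs on each ring entry; entries that do not match a known client/source pair of interest are dropped rather than queued for the interrupt worker.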