aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch253
1 files changed, 253 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch
new file mode 100644
index 00000000..b5ebb468
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch
@@ -0,0 +1,253 @@
+From ca81bd6d5a9d81d0aab4cfb780f2d401d8c5e85a Mon Sep 17 00:00:00 2001
+From: Felix Kuehling <Felix.Kuehling@amd.com>
+Date: Thu, 21 Sep 2017 16:26:41 -0400
+Subject: [PATCH 1900/4131] drm/amdgpu: Handle GPUVM fault storms
+
+When many wavefronts cause VM faults at the same time, it can
+overwhelm the interrupt handler and cause IH ring overflows before
+the driver can notify or kill the faulting application.
+
+As a workaround I'm introducing limited per-VM fault credit. After
+that number of VM faults have occurred, further VM faults are
+filtered out at the prescreen stage of processing.
+
+This depends on the PASID in the interrupt packet, so it currently
+only works for KFD contexts.
+
+Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 +++++++++++++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++-
+ drivers/gpu/drm/amd/amdgpu/cik_ih.c | 19 +++++++++++++++++--
+ drivers/gpu/drm/amd/amdgpu/cz_ih.c | 19 +++++++++++++++++--
+ drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++--
+ drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 19 +++++++++++++++++--
+ drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 11 +++++++----
+ 7 files changed, 112 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+index c542eef..3675c2f 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+@@ -2727,6 +2727,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ }
+
+ INIT_KFIFO(vm->faults);
++ vm->fault_credit = 16;
+
+ return 0;
+
+@@ -2843,6 +2844,36 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+ }
+
+ /**
++ * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
++ *
++ * @adev: amdgpu_device pointer
++ * @pasid: PASID do identify the VM
++ *
++ * This function is expected to be called in interrupt context. Returns
++ * true if there was fault credit, false otherwise
++ */
++bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
++ unsigned int pasid)
++{
++ struct amdgpu_vm *vm;
++
++ spin_lock(&adev->vm_manager.pasid_lock);
++ vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
++ spin_unlock(&adev->vm_manager.pasid_lock);
++ if (!vm)
++ /* VM not found, can't track fault credit */
++ return true;
++
++ /* No lock needed. only accessed by IRQ handler */
++ if (!vm->fault_credit)
++ /* Too many faults in this VM */
++ return false;
++
++ vm->fault_credit--;
++ return true;
++}
++
++/**
+ * amdgpu_vm_manager_init - init the VM manager
+ *
+ * @adev: amdgpu_device pointer
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+index 2056f99..1a41459 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+@@ -168,8 +168,11 @@ struct amdgpu_vm {
+ /* Flag to indicate ATS support from PTE for GFX9 */
+ bool pte_support_ats;
+
+- /* Up to 128 pending page faults */
++ /* Up to 128 pending retry page faults */
+ DECLARE_KFIFO(faults, u64, 128);
++
++ /* Limit non-retry fault storms */
++ unsigned int fault_credit;
+ };
+
+ struct amdgpu_vm_id {
+@@ -249,6 +252,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
+ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ int vm_context, unsigned int pasid);
+ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
++bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
++ unsigned int pasid);
+ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
+ struct list_head *validated,
+ struct amdgpu_bo_list_entry *entry);
+diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
+index 07d3d89..a870b35 100644
+--- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
++++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
+@@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev)
+ */
+ static bool cik_ih_prescreen_iv(struct amdgpu_device *adev)
+ {
+- /* Process all interrupts */
+- return true;
++ u32 ring_index = adev->irq.ih.rptr >> 2;
++ u16 pasid;
++
++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
++ case 146:
++ case 147:
++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
++ return true;
++ break;
++ default:
++ /* Not a VM fault */
++ return true;
++ }
++
++ adev->irq.ih.rptr += 16;
++ return false;
+ }
+
+ /**
+diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+index b6cdf4a..fa61d64 100644
+--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
++++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
+@@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev)
+ */
+ static bool cz_ih_prescreen_iv(struct amdgpu_device *adev)
+ {
+- /* Process all interrupts */
+- return true;
++ u32 ring_index = adev->irq.ih.rptr >> 2;
++ u16 pasid;
++
++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
++ case 146:
++ case 147:
++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
++ return true;
++ break;
++ default:
++ /* Not a VM fault */
++ return true;
++ }
++
++ adev->irq.ih.rptr += 16;
++ return false;
+ }
+
+ /**
+diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
+index 65ed6d3..bd592cb 100644
+--- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
++++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
+@@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev)
+ */
+ static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev)
+ {
+- /* Process all interrupts */
+- return true;
++ u32 ring_index = adev->irq.ih.rptr >> 2;
++ u16 pasid;
++
++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
++ case 146:
++ case 147:
++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
++ return true;
++ break;
++ default:
++ /* Not a VM fault */
++ return true;
++ }
++
++ adev->irq.ih.rptr += 16;
++ return false;
+ }
+
+ /**
+diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+index 5ed0069..aa4e320 100644
+--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
++++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
+@@ -227,8 +227,23 @@ static u32 tonga_ih_get_wptr(struct amdgpu_device *adev)
+ */
+ static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev)
+ {
+- /* Process all interrupts */
+- return true;
++ u32 ring_index = adev->irq.ih.rptr >> 2;
++ u16 pasid;
++
++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
++ case 146:
++ case 147:
++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
++ return true;
++ break;
++ default:
++ /* Not a VM fault */
++ return true;
++ }
++
++ adev->irq.ih.rptr += 16;
++ return false;
+ }
+
+ /**
+diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+index a3b30d8..6973257 100644
+--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
++++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+@@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev)
+ return true;
+ }
+
+- /* Not a retry fault */
+- if (!(dw5 & 0x80))
+- return true;
+-
+ pasid = dw3 & 0xffff;
+ /* No PASID, can't identify faulting process */
+ if (!pasid)
+ return true;
+
++ /* Not a retry fault, check fault credit */
++ if (!(dw5 & 0x80)) {
++ if (!amdgpu_vm_pasid_fault_credit(adev, pasid))
++ goto ignore_iv;
++ return true;
++ }
++
+ addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
+ key = AMDGPU_VM_FAULT(pasid, addr);
+ r = amdgpu_ih_add_fault(adev, key);
+--
+2.7.4
+