diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch new file mode 100644 index 00000000..b5ebb468 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1900-drm-amdgpu-Handle-GPUVM-fault-storms.patch @@ -0,0 +1,253 @@ +From ca81bd6d5a9d81d0aab4cfb780f2d401d8c5e85a Mon Sep 17 00:00:00 2001 +From: Felix Kuehling <Felix.Kuehling@amd.com> +Date: Thu, 21 Sep 2017 16:26:41 -0400 +Subject: [PATCH 1900/4131] drm/amdgpu: Handle GPUVM fault storms + +When many wavefronts cause VM faults at the same time, it can +overwhelm the interrupt handler and cause IH ring overflows before +the driver can notify or kill the faulting application. + +As a workaround I'm introducing limited per-VM fault credit. After +that number of VM faults have occurred, further VM faults are +filtered out at the prescreen stage of processing. + +This depends on the PASID in the interrupt packet, so it currently +only works for KFD contexts. 
+ +Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> +Reviewed-by: Alex Deucher <alexander.deucher@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 +++++++++++++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++- + drivers/gpu/drm/amd/amdgpu/cik_ih.c | 19 +++++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/cz_ih.c | 19 +++++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/tonga_ih.c | 19 +++++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 11 +++++++---- + 7 files changed, 112 insertions(+), 13 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +index c542eef..3675c2f 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +@@ -2727,6 +2727,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, + } + + INIT_KFIFO(vm->faults); ++ vm->fault_credit = 16; + + return 0; + +@@ -2843,6 +2844,36 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) + } + + /** ++ * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID ++ * ++ * @adev: amdgpu_device pointer ++ * @pasid: PASID to identify the VM ++ * ++ * This function is expected to be called in interrupt context. Returns ++ * true if there was fault credit, false otherwise ++ */ ++bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, ++ unsigned int pasid) ++{ ++ struct amdgpu_vm *vm; ++ ++ spin_lock(&adev->vm_manager.pasid_lock); ++ vm = idr_find(&adev->vm_manager.pasid_idr, pasid); ++ spin_unlock(&adev->vm_manager.pasid_lock); ++ if (!vm) ++ /* VM not found, can't track fault credit */ ++ return true; ++ ++ /* No lock needed. 
only accessed by IRQ handler */ ++ if (!vm->fault_credit) ++ /* Too many faults in this VM */ ++ return false; ++ ++ vm->fault_credit--; ++ return true; ++} ++ ++/** + * amdgpu_vm_manager_init - init the VM manager + * + * @adev: amdgpu_device pointer +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +index 2056f99..1a41459 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +@@ -168,8 +168,11 @@ struct amdgpu_vm { + /* Flag to indicate ATS support from PTE for GFX9 */ + bool pte_support_ats; + +- /* Up to 128 pending page faults */ ++ /* Up to 128 pending retry page faults */ + DECLARE_KFIFO(faults, u64, 128); ++ ++ /* Limit non-retry fault storms */ ++ unsigned int fault_credit; + }; + + struct amdgpu_vm_id { +@@ -249,6 +252,8 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev); + int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, + int vm_context, unsigned int pasid); + void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); ++bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev, ++ unsigned int pasid); + void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, + struct list_head *validated, + struct amdgpu_bo_list_entry *entry); +diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c +index 07d3d89..a870b35 100644 +--- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c +@@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device *adev) + */ + static bool cik_ih_prescreen_iv(struct amdgpu_device *adev) + { +- /* Process all interrupts */ +- return true; ++ u32 ring_index = adev->irq.ih.rptr >> 2; ++ u16 pasid; ++ ++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { ++ case 146: ++ case 147: ++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; ++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) ++ return true; ++ break; ++ default: ++ 
/* Not a VM fault */ ++ return true; ++ } ++ ++ adev->irq.ih.rptr += 16; ++ return false; + } + + /** +diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c +index b6cdf4a..fa61d64 100644 +--- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c +@@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device *adev) + */ + static bool cz_ih_prescreen_iv(struct amdgpu_device *adev) + { +- /* Process all interrupts */ +- return true; ++ u32 ring_index = adev->irq.ih.rptr >> 2; ++ u16 pasid; ++ ++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { ++ case 146: ++ case 147: ++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; ++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) ++ return true; ++ break; ++ default: ++ /* Not a VM fault */ ++ return true; ++ } ++ ++ adev->irq.ih.rptr += 16; ++ return false; + } + + /** +diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +index 65ed6d3..bd592cb 100644 +--- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +@@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct amdgpu_device *adev) + */ + static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev) + { +- /* Process all interrupts */ +- return true; ++ u32 ring_index = adev->irq.ih.rptr >> 2; ++ u16 pasid; ++ ++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { ++ case 146: ++ case 147: ++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; ++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) ++ return true; ++ break; ++ default: ++ /* Not a VM fault */ ++ return true; ++ } ++ ++ adev->irq.ih.rptr += 16; ++ return false; + } + + /** +diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +index 5ed0069..aa4e320 100644 +--- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +@@ -227,8 +227,23 @@ 
static u32 tonga_ih_get_wptr(struct amdgpu_device *adev) + */ + static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev) + { +- /* Process all interrupts */ +- return true; ++ u32 ring_index = adev->irq.ih.rptr >> 2; ++ u16 pasid; ++ ++ switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) { ++ case 146: ++ case 147: ++ pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16; ++ if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid)) ++ return true; ++ break; ++ default: ++ /* Not a VM fault */ ++ return true; ++ } ++ ++ adev->irq.ih.rptr += 16; ++ return false; + } + + /** +diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +index a3b30d8..6973257 100644 +--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c ++++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c +@@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct amdgpu_device *adev) + return true; + } + +- /* Not a retry fault */ +- if (!(dw5 & 0x80)) +- return true; +- + pasid = dw3 & 0xffff; + /* No PASID, can't identify faulting process */ + if (!pasid) + return true; + ++ /* Not a retry fault, check fault credit */ ++ if (!(dw5 & 0x80)) { ++ if (!amdgpu_vm_pasid_fault_credit(adev, pasid)) ++ goto ignore_iv; ++ return true; ++ } ++ + addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12); + key = AMDGPU_VM_FAULT(pasid, addr); + r = amdgpu_ih_add_fault(adev, key); +-- +2.7.4 + |