aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch')
-rw-r--r--common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch233
1 files changed, 233 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch
new file mode 100644
index 00000000..25e12af3
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch
@@ -0,0 +1,233 @@
+From 6177b4c1237ec7719ddd93a59f9ac50dd8580d4d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
+Date: Wed, 7 Nov 2018 13:55:01 +0100
+Subject: [PATCH 1617/2940] drm/amdgpu: use ring/hash for fault handling on
+ GMC9 v3
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Further testing showed that the idea with the chash doesn't work as expected.
+In particular, we can't predict when entries can be removed from the hash again.
+
+So replace the chash with a ring buffer/hash mix where entries in the container
+age automatically based on their timestamp.
+
+v2: use ring buffer / hash mix
+v3: check the timeout to make sure all entries age
+
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v2)
+Signed-off-by: Chaudhary Amit Kumar <Chaudharyamit.Kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 55 +++++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 60 ++-----------------------
+ 3 files changed, 92 insertions(+), 57 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+index 5a32a0d2ad31..250d9212cc38 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+@@ -240,3 +240,58 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
+ dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n",
+ mc->agp_size >> 20, mc->agp_start, mc->agp_end);
+ }
++
++/**
++ * amdgpu_gmc_filter_faults - filter out VM faults that were logged recently
++ *
++ * @adev: amdgpu device structure
++ * @addr: address of the VM fault, combined with @pasid into the lookup key
++ * @pasid: PASID of the process causing the fault
++ * @timestamp: timestamp of the fault, entries older than the timeout expire
++ *
++ * Returns:
++ * True if the fault is a duplicate or the ring has no expired slot to reuse.
++ * False if the fault is new; it is then logged in the fault ring and hash.
++ */
++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
++ uint16_t pasid, uint64_t timestamp)
++{
++ struct amdgpu_gmc *gmc = &adev->gmc;
++
++ uint64_t stamp, key = addr << 4 | pasid;
++ struct amdgpu_gmc_fault *fault;
++ uint32_t hash;
++
++ /* No space: the next ring slot is still live, report as filtered */
++ stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
++ AMDGPU_GMC_FAULT_TIMEOUT;
++ if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
++ return true;
++
++ /* Walk the hash chain of live entries looking for the same key */
++ hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
++ fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
++ while (fault->timestamp >= stamp) {
++ uint64_t tmp;
++
++ if (fault->key == key)
++ return true;
++
++ tmp = fault->timestamp;
++ fault = &gmc->fault_ring[fault->next];
++
++ /* Chain entries must get older; if not the slot was reused */
++ if (fault->timestamp >= tmp)
++ break;
++ }
++
++ /* Not found: log the fault in the next ring slot */
++ fault = &gmc->fault_ring[gmc->last_fault];
++ fault->key = key;
++ fault->timestamp = timestamp;
++
++ /* Make it the head of its hash chain and advance the ring index */
++ fault->next = gmc->fault_hash[hash].idx;
++ gmc->fault_hash[hash].idx = gmc->last_fault++;
++ return false;
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+index 2d63ec729bec..13479c3e6bfb 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+@@ -43,8 +43,34 @@
+ */
+ #define AMDGPU_GMC_HOLE_MASK 0x0000ffffffffffffULL
+
++/*
++ * Order (log2 of the number of entries) of the ring logging recent faults.
++ */
++#define AMDGPU_GMC_FAULT_RING_ORDER 8
++#define AMDGPU_GMC_FAULT_RING_SIZE (1 << AMDGPU_GMC_FAULT_RING_ORDER)
++
++/*
++ * Order (log2 of the number of buckets) of the hash indexing recent faults.
++ */
++#define AMDGPU_GMC_FAULT_HASH_ORDER 8
++#define AMDGPU_GMC_FAULT_HASH_SIZE (1 << AMDGPU_GMC_FAULT_HASH_ORDER)
++
++/*
++ * Number of IH timestamp ticks until a fault is considered handled
++ */
++#define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL
++
+ struct firmware;
+
++/*
++ * Logged GMC page fault; @next is the ring index of the next hash chain entry
++ */
++struct amdgpu_gmc_fault {
++ uint64_t timestamp;
++ uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER;
++ uint64_t key:52;
++};
++
+ /*
+ * VMHUB structures, functions & helpers
+ */
+@@ -141,6 +167,12 @@ struct amdgpu_gmc {
+ struct kfd_vm_fault_info *vm_fault_info;
+ atomic_t vm_fault_info_updated;
+
++ struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE];
++ struct {
++ uint64_t idx:AMDGPU_GMC_FAULT_RING_ORDER;
++ } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
++ uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
++
+ const struct amdgpu_gmc_funcs *gmc_funcs;
+
+ struct amdgpu_xgmi xgmi;
+@@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
+ struct amdgpu_gmc *mc);
+ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
+ struct amdgpu_gmc *mc);
++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
++ uint16_t pasid, uint64_t timestamp);
+
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+index ad5de0e99794..593f93d5feb2 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+@@ -300,62 +300,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev,
+ return 0;
+ }
+
+-/**
+- * vega10_ih_prescreen_iv - prescreen an interrupt vector
+- *
+- * @adev: amdgpu_device pointer
+- *
+- * Returns true if the interrupt vector should be further processed.
+- */
+-static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev,
+- struct amdgpu_iv_entry *entry,
+- uint64_t addr)
+-{
+- struct amdgpu_vm *vm;
+- u64 key;
+- int r;
+-
+- /* No PASID, can't identify faulting process */
+- if (!entry->pasid)
+- return true;
+-
+- /* Not a retry fault */
+- if (!(entry->src_data[1] & 0x80))
+- return true;
+-
+- /* Track retry faults in per-VM fault FIFO. */
+- spin_lock(&adev->vm_manager.pasid_lock);
+- vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid);
+- if (!vm) {
+- /* VM not found, process it normally */
+- spin_unlock(&adev->vm_manager.pasid_lock);
+- return true;
+- }
+-
+- key = AMDGPU_VM_FAULT(entry->pasid, addr);
+- r = amdgpu_vm_add_fault(vm->fault_hash, key);
+-
+- /* Hash table is full or the fault is already being processed,
+- * ignore further page faults
+- */
+- if (r != 0) {
+- spin_unlock(&adev->vm_manager.pasid_lock);
+- return false;
+- }
+- /* No locking required with single writer and single reader */
+- r = kfifo_put(&vm->faults, key);
+- if (!r) {
+- /* FIFO is full. Ignore it until there is space */
+- amdgpu_vm_clear_fault(vm->fault_hash, key);
+- spin_unlock(&adev->vm_manager.pasid_lock);
+- return false;
+- }
+-
+- spin_unlock(&adev->vm_manager.pasid_lock);
+- /* It's the first fault for this address, process it normally */
+- return true;
+-}
+-
+ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+@@ -368,9 +312,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
+ addr = (u64)entry->src_data[0] << 12;
+ addr |= ((u64)entry->src_data[1] & 0xf) << 44;
+
+- if (!gmc_v9_0_prescreen_iv(adev, entry, addr))
++ if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid,
++ entry->timestamp))
+ return 1; /* This also prevents sending it to KFD */
+
++ /* If it's the first fault for this address, process it normally */
+ if (!amdgpu_sriov_vf(adev)) {
+ status = RREG32(hub->vm_l2_pro_fault_status);
+ WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+--
+2.17.1
+