diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch new file mode 100644 index 00000000..25e12af3 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch @@ -0,0 +1,233 @@ +From 6177b4c1237ec7719ddd93a59f9ac50dd8580d4d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> +Date: Wed, 7 Nov 2018 13:55:01 +0100 +Subject: [PATCH 1617/2940] drm/amdgpu: use ring/hash for fault handling on + GMC9 v3 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Further testing showed that the idea with the chash doesn't work as expected. +Especially we can't predict when we can remove the entries from the hash again. + +So replace the chash with a ring buffer/hash mix where entries in the container +age automatically based on their timestamp. + +v2: use ring buffer / hash mix +v3: check the timeout to make sure all entries age + +Signed-off-by: Christian König <christian.koenig@amd.com> +Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v2) +Signed-off-by: Chaudhary Amit Kumar <Chaudharyamit.Kumar@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 55 +++++++++++++++++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++++++++++++++ + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 60 ++----------------------- + 3 files changed, 92 insertions(+), 57 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +index 5a32a0d2ad31..250d9212cc38 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +@@ -240,3 +240,58 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc) + dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n", + mc->agp_size >> 20, mc->agp_start, mc->agp_end); + } ++ ++/** ++ * amdgpu_gmc_filter_faults - filter VM faults ++ * ++ * @adev: amdgpu device structure ++ * @addr: address of the VM fault ++ * @pasid: PASID of the process causing the fault ++ * @timestamp: timestamp of the fault ++ * ++ * Returns: ++ * True if the fault was filtered and should not be processed further. ++ * False if the fault is a new one and needs to be handled. ++ */ ++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, ++ uint16_t pasid, uint64_t timestamp) ++{ ++ struct amdgpu_gmc *gmc = &adev->gmc; ++ ++ uint64_t stamp, key = addr << 4 | pasid; ++ struct amdgpu_gmc_fault *fault; ++ uint32_t hash; ++ ++ /* If we don't have space left in the ring buffer return immediately */ ++ stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) - ++ AMDGPU_GMC_FAULT_TIMEOUT; ++ if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) ++ return true; ++ ++ /* Try to find the fault in the hash */ ++ hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER); ++ fault = &gmc->fault_ring[gmc->fault_hash[hash].idx]; ++ while (fault->timestamp >= stamp) { ++ uint64_t tmp; ++ ++ if (fault->key == key) ++ return true; ++ ++ tmp = fault->timestamp; ++ fault = &gmc->fault_ring[fault->next]; ++ ++ /* Check if the entry was reused */ ++ if (fault->timestamp >= tmp) ++ break; ++ } ++ ++ /* Add the fault to the ring */ ++ fault = &gmc->fault_ring[gmc->last_fault]; ++ fault->key = key; ++ fault->timestamp = timestamp; ++ ++ /* And update the hash */ ++ fault->next = gmc->fault_hash[hash].idx; ++ gmc->fault_hash[hash].idx = gmc->last_fault++; ++ return false; ++} +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +index 2d63ec729bec..13479c3e6bfb 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +@@ -43,8 +43,34 @@ + */ + #define AMDGPU_GMC_HOLE_MASK 0x0000ffffffffffffULL + ++/* ++ * Ring size as power of two for the log of recent faults. ++ */ ++#define AMDGPU_GMC_FAULT_RING_ORDER 8 ++#define AMDGPU_GMC_FAULT_RING_SIZE (1 << AMDGPU_GMC_FAULT_RING_ORDER) ++ ++/* ++ * Hash size as power of two for the log of recent faults ++ */ ++#define AMDGPU_GMC_FAULT_HASH_ORDER 8 ++#define AMDGPU_GMC_FAULT_HASH_SIZE (1 << AMDGPU_GMC_FAULT_HASH_ORDER) ++ ++/* ++ * Number of IH timestamp ticks until a fault is considered handled ++ */ ++#define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL ++ + struct firmware; + ++/* ++ * GMC page fault information ++ */ ++struct amdgpu_gmc_fault { ++ uint64_t timestamp; ++ uint64_t next:AMDGPU_GMC_FAULT_RING_ORDER; ++ uint64_t key:52; ++}; ++ + /* + * VMHUB structures, functions & helpers + */ +@@ -141,6 +167,12 @@ struct amdgpu_gmc { + struct kfd_vm_fault_info *vm_fault_info; + atomic_t vm_fault_info_updated; + ++ struct amdgpu_gmc_fault fault_ring[AMDGPU_GMC_FAULT_RING_SIZE]; ++ struct { ++ uint64_t idx:AMDGPU_GMC_FAULT_RING_ORDER; ++ } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE]; ++ uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER; ++ + const struct amdgpu_gmc_funcs *gmc_funcs; + + struct amdgpu_xgmi xgmi; +@@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, + struct amdgpu_gmc *mc); + void amdgpu_gmc_agp_location(struct amdgpu_device *adev, + struct amdgpu_gmc *mc); ++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, ++ uint16_t pasid, uint64_t timestamp); + + #endif +diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +index ad5de0e99794..593f93d5feb2 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +@@ -300,62 +300,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev, + return 0; + } + +-/** +- * vega10_ih_prescreen_iv - prescreen an interrupt vector +- * +- * @adev: amdgpu_device pointer +- * +- * Returns true if the interrupt vector should be further processed. +- */ +-static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev, +- struct amdgpu_iv_entry *entry, +- uint64_t addr) +-{ +- struct amdgpu_vm *vm; +- u64 key; +- int r; +- +- /* No PASID, can't identify faulting process */ +- if (!entry->pasid) +- return true; +- +- /* Not a retry fault */ +- if (!(entry->src_data[1] & 0x80)) +- return true; +- +- /* Track retry faults in per-VM fault FIFO. */ +- spin_lock(&adev->vm_manager.pasid_lock); +- vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid); +- if (!vm) { +- /* VM not found, process it normally */ +- spin_unlock(&adev->vm_manager.pasid_lock); +- return true; +- } +- +- key = AMDGPU_VM_FAULT(entry->pasid, addr); +- r = amdgpu_vm_add_fault(vm->fault_hash, key); +- +- /* Hash table is full or the fault is already being processed, +- * ignore further page faults +- */ +- if (r != 0) { +- spin_unlock(&adev->vm_manager.pasid_lock); +- return false; +- } +- /* No locking required with single writer and single reader */ +- r = kfifo_put(&vm->faults, key); +- if (!r) { +- /* FIFO is full. Ignore it until there is space */ +- amdgpu_vm_clear_fault(vm->fault_hash, key); +- spin_unlock(&adev->vm_manager.pasid_lock); +- return false; +- } +- +- spin_unlock(&adev->vm_manager.pasid_lock); +- /* It's the first fault for this address, process it normally */ +- return true; +-} +- + static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +@@ -368,9 +312,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, + addr = (u64)entry->src_data[0] << 12; + addr |= ((u64)entry->src_data[1] & 0xf) << 44; + +- if (!gmc_v9_0_prescreen_iv(adev, entry, addr)) ++ if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid, ++ entry->timestamp)) + return 1; /* This also prevents sending it to KFD */ + ++ /* If it's the first fault for this address, process it normally */ + if (!amdgpu_sriov_vf(adev)) { + status = RREG32(hub->vm_l2_pro_fault_status); + WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); +-- +2.17.1 + |