1 files changed, 233 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch
new file mode 100644
index 00000000..25e12af3
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/1617-drm-amdgpu-use-ring-hash-for-fault-handling-on-GMC9-.patch
@@ -0,0 +1,233 @@
+From 6177b4c1237ec7719ddd93a59f9ac50dd8580d4d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
+Date: Wed, 7 Nov 2018 13:55:01 +0100
+Subject: [PATCH 1617/2940] drm/amdgpu: use ring/hash for fault handling on
+ GMC9 v3
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Further testing showed that the idea with the chash doesn't work as expected.
+In particular, we can't predict when entries can be removed from the hash again.
+
+So replace the chash with a ring buffer/hash mix where entries in the container
+age automatically based on their timestamp.
+
+v2: use ring buffer / hash mix
+v3: check the timeout to make sure all entries age
+
+Signed-off-by: Christian König <christian.koenig@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v2)
+Signed-off-by: Chaudhary Amit Kumar <Chaudharyamit.Kumar@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 55 +++++++++++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 34 ++++++++++++++
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 60 ++-----------------------
+ 3 files changed, 92 insertions(+), 57 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+index 5a32a0d2ad31..250d9212cc38 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+@@ -240,3 +240,58 @@ void amdgpu_gmc_agp_location(struct amdgpu_device *adev, struct amdgpu_gmc *mc)
+ 	dev_info(adev->dev, "AGP: %lluM 0x%016llX - 0x%016llX\n",
+ 			mc->agp_size >> 20, mc->agp_start, mc->agp_end);
+ }
++
++/**
++ * amdgpu_gmc_filter_faults - filter VM faults
++ *
++ * @adev: amdgpu device structure
++ * @addr: address of the VM fault
++ * @pasid: PASID of the process causing the fault
++ * @timestamp: timestamp of the fault in IH timestamp ticks
++ *
++ * Returns:
++ * True if the fault is a recent duplicate or the ring is full; skip it.
++ * False if the fault is a new one; it is recorded and needs to be handled.
++ */
++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
++			      uint16_t pasid, uint64_t timestamp)
++{
++	struct amdgpu_gmc *gmc = &adev->gmc;
++	/* key packs the fault address and PASID into one value */
++	uint64_t stamp, key = addr << 4 | pasid;
++	struct amdgpu_gmc_fault *fault;
++	uint32_t hash;
++
++	/* Ring is full if the slot to reuse next has not timed out yet */
++	stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
++		AMDGPU_GMC_FAULT_TIMEOUT;
++	if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
++		return true;
++
++	/* Walk the hash chain for this key while entries are still valid */
++	hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
++	fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
++	while (fault->timestamp >= stamp) {
++		uint64_t tmp;
++
++		if (fault->key == key)
++			return true;
++
++		tmp = fault->timestamp;
++		fault = &gmc->fault_ring[fault->next];
++
++		/* Not older than the previous one: slot was reused */
++		if (fault->timestamp >= tmp)
++			break;
++	}
++
++	/* Record the new fault in the next ring slot */
++	fault = &gmc->fault_ring[gmc->last_fault];
++	fault->key = key;
++	fault->timestamp = timestamp;
++
++	/* Make it the head of its hash chain and advance the ring */
++	fault->next = gmc->fault_hash[hash].idx;
++	gmc->fault_hash[hash].idx = gmc->last_fault++;
++	return false;
++}
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+index 2d63ec729bec..13479c3e6bfb 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+@@ -43,8 +43,34 @@
+  */
+ #define AMDGPU_GMC_HOLE_MASK	0x0000ffffffffffffULL
+ 
++/*
++ * Order (log2) and entry count of the ring logging recent faults.
++ */
++#define AMDGPU_GMC_FAULT_RING_ORDER	8
++#define AMDGPU_GMC_FAULT_RING_SIZE	(1 << AMDGPU_GMC_FAULT_RING_ORDER)
++
++/*
++ * Order (log2) and bucket count of the hash indexing the recent-fault ring.
++ */
++#define AMDGPU_GMC_FAULT_HASH_ORDER	8
++#define AMDGPU_GMC_FAULT_HASH_SIZE	(1 << AMDGPU_GMC_FAULT_HASH_ORDER)
++
++/*
++ * Number of IH timestamp ticks until a logged fault is considered handled
++ */
++#define AMDGPU_GMC_FAULT_TIMEOUT	5000ULL
++
+ struct firmware;
+ 
++/*
++ * GMC page fault information: one entry of the recent-fault ring
++ */
++struct amdgpu_gmc_fault {
++	uint64_t	timestamp;	/* timestamp of the logged fault */
++	uint64_t	next:AMDGPU_GMC_FAULT_RING_ORDER; /* next in chain */
++	uint64_t	key:52;		/* addr << 4 | pasid */
++};
++
+ /*
+  * VMHUB structures, functions & helpers
+  */
+@@ -141,6 +167,12 @@ struct amdgpu_gmc {
+ 	struct kfd_vm_fault_info *vm_fault_info;
+ 	atomic_t                vm_fault_info_updated;
+ 
++	struct amdgpu_gmc_fault	fault_ring[AMDGPU_GMC_FAULT_RING_SIZE];
++	struct {
++		uint64_t	idx:AMDGPU_GMC_FAULT_RING_ORDER;
++	} fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
++	uint64_t		last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
++
+ 	const struct amdgpu_gmc_funcs	*gmc_funcs;
+ 
+ 	struct amdgpu_xgmi xgmi;
+@@ -195,5 +227,7 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
+ 			      struct amdgpu_gmc *mc);
+ void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
+ 			     struct amdgpu_gmc *mc);
++bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
++			      uint16_t pasid, uint64_t timestamp);
+ 
+ #endif
+diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+index ad5de0e99794..593f93d5feb2 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+@@ -300,62 +300,6 @@ static int gmc_v9_0_vm_fault_interrupt_state(struct amdgpu_device *adev,
+ 	return 0;
+ }
+ 
+-/**
+- * vega10_ih_prescreen_iv - prescreen an interrupt vector
+- *
+- * @adev: amdgpu_device pointer
+- *
+- * Returns true if the interrupt vector should be further processed.
+- */
+-static bool gmc_v9_0_prescreen_iv(struct amdgpu_device *adev,
+-				  struct amdgpu_iv_entry *entry,
+-				  uint64_t addr)
+-{
+-	struct amdgpu_vm *vm;
+-	u64 key;
+-	int r;
+-
+-	/* No PASID, can't identify faulting process */
+-	if (!entry->pasid)
+-		return true;
+-
+-	/* Not a retry fault */
+-	if (!(entry->src_data[1] & 0x80))
+-		return true;
+-
+-	/* Track retry faults in per-VM fault FIFO. */
+-	spin_lock(&adev->vm_manager.pasid_lock);
+-	vm = idr_find(&adev->vm_manager.pasid_idr, entry->pasid);
+-	if (!vm) {
+-		/* VM not found, process it normally */
+-		spin_unlock(&adev->vm_manager.pasid_lock);
+-		return true;
+-	}
+-
+-	key = AMDGPU_VM_FAULT(entry->pasid, addr);
+-	r = amdgpu_vm_add_fault(vm->fault_hash, key);
+-
+-	/* Hash table is full or the fault is already being processed,
+-	 * ignore further page faults
+-	 */
+-	if (r != 0) {
+-		spin_unlock(&adev->vm_manager.pasid_lock);
+-		return false;
+-	}
+-	/* No locking required with single writer and single reader */
+-	r = kfifo_put(&vm->faults, key);
+-	if (!r) {
+-		/* FIFO is full. Ignore it until there is space */
+-		amdgpu_vm_clear_fault(vm->fault_hash, key);
+-		spin_unlock(&adev->vm_manager.pasid_lock);
+-		return false;
+-	}
+-
+-	spin_unlock(&adev->vm_manager.pasid_lock);
+-	/* It's the first fault for this address, process it normally */
+-	return true;
+-}
+-
+ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
+ 				struct amdgpu_irq_src *source,
+ 				struct amdgpu_iv_entry *entry)
+@@ -368,9 +312,11 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
+ 	addr = (u64)entry->src_data[0] << 12;
+ 	addr |= ((u64)entry->src_data[1] & 0xf) << 44;
+ 
+-	if (!gmc_v9_0_prescreen_iv(adev, entry, addr))
++	if (retry_fault && amdgpu_gmc_filter_faults(adev, addr, entry->pasid,
++						    entry->timestamp))
+ 		return 1; /* This also prevents sending it to KFD */
+ 
++	/* If it's the first fault for this address, process it normally */
+ 	if (!amdgpu_sriov_vf(adev)) {
+ 		status = RREG32(hub->vm_l2_pro_fault_status);
+ 		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
+-- 
+2.17.1
+