aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/2131-drm-amdkfd-fix-hang-issue-if-KFD-memory-failed-to-re.patch
blob: bf2ad659b91806dd48a18d2774f59d68ee9bb9ee (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
From b7cfd306079e1746315d6d2ad5c837623a07cbaf Mon Sep 17 00:00:00 2001
From: Philip Yang <Philip.Yang@amd.com>
Date: Fri, 20 Oct 2017 14:32:27 -0400
Subject: [PATCH 2131/4131] drm/amdkfd: fix hang issue if KFD memory failed to
 restore

In the current logic, if a restore fails, the restore work is rescheduled
and a new fence is created for process->eviction_fence. That new fence is
then attached to any BOs created before the next restore. As a result, the
next restore triggers eviction work, because the new fence is not yet
signaled when ttm_bo_wait runs on those new BOs. The eviction count can
detect and skip the eviction, but it schedules the restore work again,
which results in a recursive eviction/restore loop.

The fix is to create the new fence only after the restore has finished
completely.

BUG: SWDEV-134782

Change-Id: Ib2f25f8a50a5eb8247572a105085482dc7326733
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 30 +++++++++++++-----------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 900019d..5de8398 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2287,7 +2287,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 	struct amdkfd_vm *peer_vm;
 	struct kgd_mem *mem;
 	struct bo_vm_reservation_context ctx;
-	struct amdgpu_amdkfd_fence *old_fence;
+	struct amdgpu_amdkfd_fence *new_fence;
 	int ret = 0, i;
 	struct list_head duplicate_save;
 	struct amdgpu_sync sync_obj;
@@ -2302,19 +2302,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 	if (pd_bo_list == NULL)
 		return -ENOMEM;
 
-	/* Release old eviction fence and create new one. Use context and mm
-	 * from the old fence.
-	 */
-	old_fence = process_info->eviction_fence;
-	process_info->eviction_fence =
-		amdgpu_amdkfd_fence_create(old_fence->base.context,
-					   old_fence->mm);
-	dma_fence_put(&old_fence->base);
-	if (!process_info->eviction_fence) {
-		pr_err("Failed to create eviction fence\n");
-		goto evict_fence_fail;
-	}
-
 	i = 0;
 	mutex_lock(&process_info->lock);
 	list_for_each_entry(peer_vm, &process_info->vm_list_head,
@@ -2386,6 +2373,21 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 
 	amdgpu_sync_wait(ctx.sync, false);
 
+	/* Release the old eviction fence and create a new one. A fence only
+	 * goes from unsignaled to signaled, so it cannot be reused.
+	 * Use the context and mm from the old fence.
+	 */
+	new_fence = amdgpu_amdkfd_fence_create(
+				process_info->eviction_fence->base.context,
+				process_info->eviction_fence->mm);
+	if (!new_fence) {
+		pr_err("Failed to create eviction fence\n");
+		ret = -ENOMEM;
+		goto validate_map_fail;
+	}
+	dma_fence_put(&process_info->eviction_fence->base);
+	process_info->eviction_fence = new_fence;
+
 	/* Wait for validate to finish and attach new eviction fence */
 	list_for_each_entry(mem, &process_info->kfd_bo_list,
 		validate_list.head)
-- 
2.7.4