1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
From b7cfd306079e1746315d6d2ad5c837623a07cbaf Mon Sep 17 00:00:00 2001
From: Philip Yang <Philip.Yang@amd.com>
Date: Fri, 20 Oct 2017 14:32:27 -0400
Subject: [PATCH 2131/4131] drm/amdkfd: fix hang issue if KFD memory failed to
restore
In the current logic, if a restore fails, the restore work is
rescheduled and a new fence is created for process->eviction_fence.
However, that new fence gets attached to BOs created before the next
restore. As a result, the next restore triggers eviction work, because
the new fence is still unsignaled when ttm_bo_wait is called on those
new BOs. The eviction count can detect this and skip the eviction, but
it schedules the restore work again, which leads to a recursive
eviction/restore cycle.
The fix is to create the new fence only after the restore has finished
completely.
BUG: SWDEV-134782
Change-Id: Ib2f25f8a50a5eb8247572a105085482dc7326733
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 30 +++++++++++++-----------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 900019d..5de8398 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2287,7 +2287,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
struct amdkfd_vm *peer_vm;
struct kgd_mem *mem;
struct bo_vm_reservation_context ctx;
- struct amdgpu_amdkfd_fence *old_fence;
+ struct amdgpu_amdkfd_fence *new_fence;
int ret = 0, i;
struct list_head duplicate_save;
struct amdgpu_sync sync_obj;
@@ -2302,19 +2302,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
if (pd_bo_list == NULL)
return -ENOMEM;
- /* Release old eviction fence and create new one. Use context and mm
- * from the old fence.
- */
- old_fence = process_info->eviction_fence;
- process_info->eviction_fence =
- amdgpu_amdkfd_fence_create(old_fence->base.context,
- old_fence->mm);
- dma_fence_put(&old_fence->base);
- if (!process_info->eviction_fence) {
- pr_err("Failed to create eviction fence\n");
- goto evict_fence_fail;
- }
-
i = 0;
mutex_lock(&process_info->lock);
list_for_each_entry(peer_vm, &process_info->vm_list_head,
@@ -2386,6 +2373,21 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
amdgpu_sync_wait(ctx.sync, false);
+ /* Release the old eviction fence and create a new one. A fence only
+ * goes from unsignaled to signaled, so it cannot be reused.
+ * Use context and mm from the old fence.
+ */
+ new_fence = amdgpu_amdkfd_fence_create(
+ process_info->eviction_fence->base.context,
+ process_info->eviction_fence->mm);
+ if (!new_fence) {
+ pr_err("Failed to create eviction fence\n");
+ ret = -ENOMEM;
+ goto validate_map_fail;
+ }
+ dma_fence_put(&process_info->eviction_fence->base);
+ process_info->eviction_fence = new_fence;
+
/* Wait for validate to finish and attach new eviction fence */
list_for_each_entry(mem, &process_info->kfd_bo_list,
validate_list.head)
--
2.7.4
|