From 8597c8ba04362ffa14c9914ec84e0a969153e78e Mon Sep 17 00:00:00 2001
From: Yong Zhao <yong.zhao@amd.com>
Date: Mon, 24 Oct 2016 17:16:01 -0400
Subject: [PATCH 1548/4131] drm/amdkfd: Move resume_mm/quiesce_mm out of
 reserve/unreserve scope

resume_mm/quiesce_mm should be called without the BO and VMs reserved.
Otherwise a deadlock can occur, because the PD BO of the VM may be
reserved a second time when get_process_page_dir() is called.

Change-Id: I6e268e34d43edee5a4beb29ca1ee55de41825787
Signed-off-by: Yong Zhao <yong.zhao@amd.com>
---
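A minimal, compilable sketch of the deferred-call pattern this patch
introduces, with hypothetical stand-ins for reserve_bo_and_vms(),
unreserve_bo_and_vms() and kgd2kfd->quiesce_mm(): do bookkeeping only
while the reservation is held, count the deferred operations, and issue
the quiesce calls once the reservation has been dropped.

  #include <stdio.h>

  /* Hypothetical stand-ins for the driver's calls; the real code uses
   * reserve_bo_and_vms(), unreserve_bo_and_vms() and
   * kgd2kfd->quiesce_mm(). */
  static void reserve_bo_and_vms(void)   { printf("reserve\n"); }
  static void unreserve_bo_and_vms(void) { printf("unreserve\n"); }
  static int quiesce_mm(void)            { printf("quiesce\n"); return 0; }

  /* Do bookkeeping only under the reservation and count the deferred
   * operations; call quiesce_mm() after the reservation is dropped, so
   * it may safely reserve the PD BO itself. quiesce_mm() is reference
   * counted, so calling it once per evicted mapping is fine. */
  static int map_evicted_mappings(int n_evicted)
  {
          int num_to_quiesce = 0, ret = 0;

          reserve_bo_and_vms();
          while (n_evicted--)
                  num_to_quiesce++;   /* mark mapped, defer the quiesce */
          unreserve_bo_and_vms();

          while (num_to_quiesce--) {
                  ret = quiesce_mm();
                  if (ret != 0)
                          break;
          }
          return ret;
  }

  int main(void)
  {
          return map_evicted_mappings(2);
  }

The unmap path in amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu() applies
the same pattern with num_to_resume and kgd2kfd->resume_mm().
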
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 83 +++++++++++++++++-------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index ddb9cab..6581539 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1061,6 +1061,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 	struct kfd_bo_va_list *bo_va_entry = NULL;
 	struct kfd_bo_va_list *bo_va_entry_aql = NULL;
 	struct amdkfd_vm *kfd_vm = (struct amdkfd_vm *)vm;
+	int num_to_quiesce = 0;
 
 	BUG_ON(kgd == NULL);
 	BUG_ON(mem == NULL);
@@ -1126,14 +1127,12 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 		if (entry->bo_va->vm == vm && !entry->is_mapped) {
 			if (mem->evicted) {
 				/* If the BO is evicted, just mark the
-				 * mapping as mapped and stop the GPU's
-				 * queues until the BO is restored. */
-				ret = kgd2kfd->quiesce_mm(adev->kfd,
-							  current->mm);
-				if (ret != 0)
-					goto quiesce_failed;
+				 * mapping as mapped and the GPU's queues
+				 * will be stopped later.
+				 */
 				entry->is_mapped = true;
 				mem->mapped_to_gpu_memory++;
+				num_to_quiesce++;
 				continue;
 			}
 
@@ -1158,11 +1157,23 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 				true);
 	unreserve_bo_and_vms(&ctx, true);
 
+	while (num_to_quiesce--) {
+		/* Now stop the GPU's queues while bo and VMs are unreserved.
+		 * quiesce_mm() is reference counted, and that is why we can
+		 * call it multiple times.
+		 */
+		ret = kgd2kfd->quiesce_mm(adev->kfd, current->mm);
+		if (ret != 0) {
+			pr_err("quiesce_mm() failed\n");
+			reserve_bo_and_vm(mem, vm, &ctx);
+			goto map_bo_to_gpuvm_failed;
+		}
+	}
+
 	mutex_unlock(&mem->lock);
-	return 0;
+	return ret;
 
 map_bo_to_gpuvm_failed:
-quiesce_failed:
 update_user_pages_failed:
 	if (bo_va_entry_aql)
 		remove_bo_from_vm(adev, bo_va_entry_aql);
@@ -1349,6 +1360,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 	unsigned mapped_before;
 	int ret = 0;
 	struct bo_vm_reservation_context ctx;
+	int num_to_resume = 0;
 
 	BUG_ON(kgd == NULL);
 	BUG_ON(mem == NULL);
@@ -1381,14 +1393,12 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 		if (entry->bo_va->vm == vm && entry->is_mapped) {
 			if (mem->evicted) {
 				/* If the BO is evicted, just mark the
-				 * mapping as unmapped and allow the
-				 * GPU's queues to resume. */
-				ret = kgd2kfd->resume_mm(adev->kfd,
-							 current->mm);
-				if (ret != 0)
-					goto unreserve_out;
+				 * mapping as unmapped and the GPU's queues
+				 * will be resumed later.
+				 */
 				entry->is_mapped = false;
 				mem->mapped_to_gpu_memory--;
+				num_to_resume++;
 				continue;
 			}
 
@@ -1430,6 +1440,18 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 
 unreserve_out:
 	unreserve_bo_and_vms(&ctx, false);
+
+	while (num_to_resume--) {
+		/* Now resume GPU's queues while bo and VMs are unreserved.
+		 * resume_mm() is reference counted, and that is why we can
+		 * call it multiple times.
+		 */
+		ret = kgd2kfd->resume_mm(adev->kfd, current->mm);
+		if (ret != 0) {
+			pr_err("resume_mm() failed.\n");
+			break;
+		}
+	}
 out:
 	mutex_unlock(&mem->lock);
 	return ret;
@@ -1694,7 +1716,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd, int dma_buf_fd,
 int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 {
 	struct kfd_bo_va_list *entry;
-	unsigned n_evicted;
+	unsigned int n_evicted = 0, n_unmapped = 0;
 	int r = 0;
 	struct bo_vm_reservation_context ctx;
 
@@ -1708,11 +1730,6 @@ int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 	 * queues of the affected GPUs are quiesced first. Count the
 	 * number of evicted mappings so we can roll back if something
 	 * goes wrong. */
-	n_evicted = 0;
-
-	r = reserve_bo_and_cond_vms(mem, NULL, VA_MAPPED, &ctx);
-	if (unlikely(r != 0))
-		return r;
 
 	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
 		struct amdgpu_device *adev;
@@ -1728,16 +1745,31 @@ int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 			goto fail;
 		}
 
+		n_evicted++;
+	}
+
+	r = reserve_bo_and_cond_vms(mem, NULL, VA_MAPPED, &ctx);
+	if (unlikely(r != 0))
+		goto fail;
+
+	list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
+		struct amdgpu_device *adev;
+
+		if (!entry->is_mapped)
+			continue;
+
+		adev = (struct amdgpu_device *)entry->kgd_dev;
+
 		r = unmap_bo_from_gpuvm(adev, mem->bo,
 					entry->bo_va, &ctx.sync);
 		if (r != 0) {
 			pr_err("failed unmap va 0x%llx\n",
 			       mem->va);
-			kgd2kfd->resume_mm(adev->kfd, mm);
+			unreserve_bo_and_vms(&ctx, true);
 			goto fail;
 		}
 
-		n_evicted++;
+		n_unmapped++;
 	}
 
 	unreserve_bo_and_vms(&ctx, true);
@@ -1745,7 +1777,6 @@ int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 	return 0;
 
 fail:
-	unreserve_bo_and_vms(&ctx, true);
 	/* To avoid hangs and keep state consistent, roll back partial
 	 * eviction by restoring queues and marking mappings as
 	 * unmapped. Access to now unmapped buffers will fault. */
@@ -1757,12 +1788,14 @@ int amdgpu_amdkfd_gpuvm_evict_mem(struct kgd_mem *mem, struct mm_struct *mm)
 		if (!entry->is_mapped)
 			continue;
 
-		entry->is_mapped = false;
+		if (n_unmapped) {
+			entry->is_mapped = false;
+			n_unmapped--;
+		}
 
 		adev = (struct amdgpu_device *)entry->kgd_dev;
 		if (kgd2kfd->resume_mm(adev->kfd, mm))
 			pr_err("Failed to resume KFD\n");
-
 		n_evicted--;
 	}
 
-- 
2.7.4
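
A minimal, compilable sketch of the two-counter rollback used above in
amdgpu_amdkfd_gpuvm_evict_mem(), again with hypothetical stand-ins:
n_evicted counts queues that were quiesced and n_unmapped counts
mappings that were actually torn down, so the failure path resumes
every quiesced queue but clears is_mapped only for entries whose unmap
really happened.

  #include <stdbool.h>
  #include <stdio.h>

  #define N_ENTRIES 3

  struct entry { bool is_mapped; };

  /* Hypothetical stand-ins; the real code calls kgd2kfd->quiesce_mm(),
   * unmap_bo_from_gpuvm() and kgd2kfd->resume_mm(). */
  static int quiesce(void)  { return 0; }
  static int unmap(int i)   { return i == 1 ? -1 : 0; } /* 2nd unmap fails */
  static void resume(void)  { printf("resume\n"); }

  static int evict(struct entry e[])
  {
          unsigned int n_evicted = 0, n_unmapped = 0;
          int i, r = 0;

          for (i = 0; i < N_ENTRIES; i++) {       /* phase 1: quiesce all */
                  r = quiesce();
                  if (r != 0)
                          goto fail;
                  n_evicted++;
          }

          for (i = 0; i < N_ENTRIES; i++) {       /* phase 2: unmap all */
                  r = unmap(i);
                  if (r != 0)
                          goto fail;
                  n_unmapped++;
          }
          return 0;

  fail:
          /* Resume one queue per successful quiesce, but mark an entry
           * unmapped only if its unmap actually completed. */
          for (i = 0; i < N_ENTRIES && n_evicted > 0; i++) {
                  if (n_unmapped > 0) {
                          e[i].is_mapped = false;
                          n_unmapped--;
                  }
                  resume();
                  n_evicted--;
          }
          return r;
  }

  int main(void)
  {
          struct entry e[N_ENTRIES] = { { true }, { true }, { true } };

          return evict(e) ? 1 : 0;
  }

Keeping the two counters separate is what lets the failure path stay
consistent when reserve_bo_and_cond_vms() or an unmap fails partway
through the list.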