From e44332781979018efc12a102ff6943a5c1d340c1 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Sun, 24 Sep 2017 00:54:33 -0400
Subject: [PATCH 1344/4131] drm/amdkfd: Remove indiscriminate resetting of
 queues

Resetting queues affects all processes. We can't allow any action
triggered by a user mode process to affect other processes. Therefore
process termination and VM faults cannot be allowed to reset queues
indiscriminately for all processes.

Change-Id: I41f0a7426ac0825041548e0718cb236be417d75d
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
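Note for reviewers (after the "---", so not part of the commit message): the
behavioral change is easiest to see in isolation. Below is a minimal
standalone C sketch of the principle, using toy names only (toy_queue,
reset_all_queues, evict_queues_by_pasid are illustrative, not KFD
identifiers): a VM fault or termination in one process should evict only that
process's queues, identified by PASID, rather than resetting queues
device-wide and disturbing unrelated processes.

#include <stdio.h>

/* Toy model, hypothetical names: each queue is tagged with the PASID of
 * its owning process. */
struct toy_queue {
	unsigned int pasid;
	int active;
};

static struct toy_queue queues[] = {
	{ .pasid = 1, .active = 1 },
	{ .pasid = 2, .active = 1 },
	{ .pasid = 3, .active = 1 },
};

#define N_QUEUES (sizeof(queues) / sizeof(queues[0]))

/* Old behavior removed by this patch: reset every queue on the device,
 * regardless of which process triggered the fault. */
static void reset_all_queues(void)
{
	size_t i;

	for (i = 0; i < N_QUEUES; i++)
		queues[i].active = 0;
}

/* New behavior: evict only the queues owned by the faulting process. */
static void evict_queues_by_pasid(unsigned int pasid)
{
	size_t i;

	for (i = 0; i < N_QUEUES; i++)
		if (queues[i].pasid == pasid)
			queues[i].active = 0;
}

int main(void)
{
	size_t i;

	(void)reset_all_queues;		/* kept only for contrast */
	evict_queues_by_pasid(2);	/* simulate a VM fault from PASID 2 */

	for (i = 0; i < N_QUEUES; i++)
		printf("pasid %u: %s\n", queues[i].pasid,
		       queues[i].active ? "running" : "evicted");
	return 0;
}

The hunks below are the real version of this: the "bool reset" parameter is
dropped from kfd_process_vm_fault(), process_evict_queues() and the
unmap/execute helpers, so eviction is always scoped to the queue list of the
faulting or terminating process.
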
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c   | 11 +---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  4 +-
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 65 +++++++++-------------
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.h  |  3 +-
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c    |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |  3 +-
 6 files changed, 33 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 5f122a1..00536a1 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -95,17 +95,10 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
 		ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
 		struct kfd_vm_fault_info info;
 
+		kfd_process_vm_fault(dev->dqm, ihre->pasid);
+
 		memset(&info, 0, sizeof(info));
 		dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
-		/* When CWSR is disabled, we choose to reset the device, which
-		 * will reset the queues from other processes on this device.
-		 * This is a bug that we accept given by-pasid reset does not
-		 * work well.
-		 */
-		if (dev->cwsr_enabled)
-			kfd_process_vm_fault(dev->dqm, ihre->pasid, false);
-		else
-			kfd_process_vm_fault(dev->dqm, ihre->pasid, true);
 		if (!info.page_addr && !info.status)
 			return;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index acee0aa..af2424e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -800,7 +800,7 @@ static int quiesce_process_mm(struct kfd_process *p)
 	unsigned int n_evicted = 0;
 
 	list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
-		r = process_evict_queues(pdd->dev->dqm, &pdd->qpd, false);
+		r = process_evict_queues(pdd->dev->dqm, &pdd->qpd);
 		if (r != 0) {
 			pr_err("Failed to evict process queues\n");
 			goto fail;
@@ -872,7 +872,7 @@ int kgd2kfd_quiesce_mm(struct kfd_dev *kfd, struct mm_struct *mm)
 		r = -ENODEV;
 		pdd = kfd_get_process_device_data(kfd, p);
 		if (pdd)
-			r = process_evict_queues(kfd->dqm, &pdd->qpd, false);
+			r = process_evict_queues(kfd->dqm, &pdd->qpd);
 	} else {
 		r = quiesce_process_mm(p);
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 3a09cbc..2d8c238 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -45,11 +45,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
-				bool static_queues_included,
-				bool reset);
+				bool static_queues_included);
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		enum kfd_unmap_queues_filter filter,
-		uint32_t filter_param, bool reset);
+		uint32_t filter_param);
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
@@ -503,8 +502,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	/* HWS mode, unmap first to own mqd */
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
 		retval = unmap_queues_cpsch(dqm,
-				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
-				false);
+				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 		if (retval) {
 			pr_err("unmap queue failed");
 			goto out_unlock;
@@ -567,8 +565,7 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
 }
 
 int process_evict_queues(struct device_queue_manager *dqm,
-			 struct qcm_process_device *qpd,
-			 bool reset)
+			 struct qcm_process_device *qpd)
 {
 	struct queue *q, *next;
 	struct mqd_manager *mqd;
@@ -607,7 +604,7 @@ int process_evict_queues(struct device_queue_manager *dqm,
 			dqm->queue_count--;
 	}
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
-		retval = execute_queues_cpsch(dqm, qpd->is_debug, reset);
+		retval = execute_queues_cpsch(dqm, qpd->is_debug);
 
 out:
 	mutex_unlock(&dqm->lock);
@@ -677,7 +674,7 @@ int process_restore_queues(struct device_queue_manager *dqm,
 		}
 	}
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
-		retval = execute_queues_cpsch(dqm, false, false);
+		retval = execute_queues_cpsch(dqm, false);
 
 	if (retval == 0)
 		qpd->evicted = 0;
@@ -998,7 +995,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	init_interrupts(dqm);
 
 	mutex_lock(&dqm->lock);
-	execute_queues_cpsch(dqm, false, false);
+	execute_queues_cpsch(dqm, false);
 	mutex_unlock(&dqm->lock);
 
 	return 0;
@@ -1013,7 +1010,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 {
 	mutex_lock(&dqm->lock);
 
-	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
+	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
 
 	mutex_unlock(&dqm->lock);
 
@@ -1046,7 +1043,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_add(&kq->list, &qpd->priv_queue_list);
 	dqm->queue_count++;
 	qpd->is_debug = true;
-	execute_queues_cpsch(dqm, false, false);
+	execute_queues_cpsch(dqm, false);
 	mutex_unlock(&dqm->lock);
 
 	return 0;
@@ -1061,7 +1058,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
 	list_del(&kq->list);
 	dqm->queue_count--;
 	qpd->is_debug = false;
-	execute_queues_cpsch(dqm, true, false);
+	execute_queues_cpsch(dqm, true);
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
 	 * type.
@@ -1135,7 +1132,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	qpd->queue_count++;
 	if (q->properties.is_active) {
 		dqm->queue_count++;
-		retval = execute_queues_cpsch(dqm, false, false);
+		retval = execute_queues_cpsch(dqm, false);
 	}
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
@@ -1183,11 +1180,10 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 }
 
 static int unmap_sdma_queues(struct device_queue_manager *dqm,
-			     unsigned int sdma_engine,
-			     bool reset)
+			     unsigned int sdma_engine)
 {
 	return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
-			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, reset,
+			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
 			sdma_engine);
 }
 
@@ -1208,7 +1204,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 
 	retval = pm_send_runlist(&dqm->packets, &dqm->queues);
 	if (retval) {
-		pr_err("failed to execute runlist");
+		pr_err("failed to execute runlist\n");
 		return retval;
 	}
 	dqm->active_runlist = true;
@@ -1219,7 +1215,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 /* dqm->lock mutex has to be locked before calling this function */
 static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		enum kfd_unmap_queues_filter filter,
-		uint32_t filter_param, bool reset)
+		uint32_t filter_param)
 {
 	int retval;
 
@@ -1232,12 +1228,12 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 		dqm->sdma_queue_count);
 
 	if (dqm->sdma_queue_count > 0) {
-		unmap_sdma_queues(dqm, 0, reset);
-		unmap_sdma_queues(dqm, 1, reset);
+		unmap_sdma_queues(dqm, 0);
+		unmap_sdma_queues(dqm, 1);
 	}
 
 	retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
-			filter, filter_param, reset, 0);
+			filter, filter_param, false, 0);
 	if (retval)
 		return retval;
 
@@ -1248,7 +1244,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
 				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
 	if (retval) {
-		pr_err("%s queues failed.", reset ? "Resetting" : "Unmapping");
+		pr_err("Unmapping queues failed.\n");
 		return retval;
 	}
 
@@ -1260,8 +1256,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 
 /* dqm->lock mutex has to be locked before calling this function */
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
-				bool static_queues_included,
-				bool reset)
+				bool static_queues_included)
 {
 	int retval;
 	enum kfd_unmap_queues_filter filter;
@@ -1270,9 +1265,9 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
 			KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
 			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
 
-	retval = unmap_queues_cpsch(dqm, filter, 0, reset);
+	retval = unmap_queues_cpsch(dqm, filter, 0);
 	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption");
+		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
 		return retval;
 	}
 
@@ -1325,7 +1320,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	if (q->properties.is_active)
 		dqm->queue_count--;
 
-	retval = execute_queues_cpsch(dqm, false, false);
+	retval = execute_queues_cpsch(dqm, false);
 	if (retval == -ETIME)
 		qpd->reset_wavefronts = true;
 
@@ -1552,15 +1547,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		}
 	}
 
-	/* When CWSR is disabled, we choose to reset the device, which will
-	 * reset the queues from other processes on this device. This is
-	 * a bug that we accept given by-pasid reset does not work well.
-	 */
-	if (dqm->dev->cwsr_enabled)
-		retval = execute_queues_cpsch(dqm, true, false);
-	else
-		retval = execute_queues_cpsch(dqm, true, true);
-
+	retval = execute_queues_cpsch(dqm, true);
 	if (retval || qpd->reset_wavefronts) {
 		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
 		dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
@@ -1692,7 +1679,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
 }
 
 int kfd_process_vm_fault(struct device_queue_manager *dqm,
-			 unsigned int pasid, bool reset)
+			 unsigned int pasid)
 {
 	struct kfd_process_device *pdd;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -1702,7 +1689,7 @@ int kfd_process_vm_fault(struct device_queue_manager *dqm,
 		return -EINVAL;
 	pdd = kfd_get_process_device_data(dqm->dev, p);
 	if (pdd)
-		ret = process_evict_queues(dqm, &pdd->qpd, reset);
+		ret = process_evict_queues(dqm, &pdd->qpd);
 	kfd_unref_process(p);
 
 	return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index a492307..841283a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -216,8 +216,7 @@ unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 
 int process_evict_queues(struct device_queue_manager *dqm,
-		struct qcm_process_device *qpd,
-		bool reset);
+		struct qcm_process_device *qpd);
 int process_restore_queues(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index ccfc89a..b2c6b52 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -122,7 +122,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 		info.prot_read  = ring_id & 0x10;
 		info.prot_write = ring_id & 0x20;
 
-		kfd_process_vm_fault(dev->dqm, pasid, false);
+		kfd_process_vm_fault(dev->dqm, pasid);
 		kfd_signal_vm_fault_event(dev, pasid, &info);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index c853956..43a8838 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -892,8 +892,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 					enum kfd_queue_type type);
 void kernel_queue_uninit(struct kernel_queue *kq);
-int kfd_process_vm_fault(struct device_queue_manager *dqm,
-			 unsigned int pasid, bool reset);
+int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
 struct process_queue_node {
-- 
2.7.4