aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1320-drm-amdkfd-Error-handling-fixes-ported-from-upstream.patch
blob: f284104ef9d359fb6e200c26a6ea31057229bfa8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
From 26e74ca954c8a65f517ad546e4f1592d978e48b6 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Fri, 28 Jul 2017 20:29:05 -0400
Subject: [PATCH 1320/4131] drm/amdkfd: Error handling fixes ported from
 upstream

When a packet buffer that was acquired from a kernel queue cannot be
submitted due to errors, it must be rolled back to keep the state
of the kernel queue consistent.

Destroy the queue in dbgdev_register_diq if getting the kernel queue
fails.

Return -EINVAL right away from the default switch case in
dbgdev_wave_control_set_registers instead of recording the error and
continuing.

Add back error checks that were removed when BUG_ONs were eliminated.
Remove redundant WARN_ONs on NULL pointers, since dereferencing them
will cause a BUG anyway.

Change-Id: I56e82526e8dac87925e27f7019e0659e9519c446
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c            |  5 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  7 ++++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  5 +++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c      | 11 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c   |  8 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c   |  8 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c   |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c    |  3 ++
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    | 49 +++++++++++++++-------
 9 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 942d863..4c267a9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -126,6 +126,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev,
 
 	if (status != 0) {
 		pr_err("Failed to allocate GART memory\n");
+		kq->ops.rollback_packet(kq);
 		return status;
 	}
 
@@ -213,6 +214,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
 
 	if (!kq) {
 		pr_err("Error getting Kernel Queue\n");
+		pqm_destroy_queue(dbgdev->pqm, qid);
 		return -ENOMEM;
 	}
 	dbgdev->kq = kq;
@@ -575,8 +577,7 @@ static int dbgdev_wave_control_set_registers(
 		break;
 
 	default:
-		status = -EINVAL;
-		break;
+		return -EINVAL;
 	}
 
 	switch (wac_info->operand) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index da05b68..35c0b554 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1074,6 +1074,13 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
 {
 	unsigned int num_of_longs;
 
+	if (WARN_ON(buf_size < chunk_size))
+		return -EINVAL;
+	if (WARN_ON(buf_size == 0))
+		return -EINVAL;
+	if (WARN_ON(chunk_size == 0))
+		return -EINVAL;
+
 	kfd->gtt_sa_chunk_size = chunk_size;
 	kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index bfc87c0..b78a773 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -535,6 +535,9 @@ static struct mqd_manager *get_mqd_manager_nocpsch(
 {
 	struct mqd_manager *mqd;
 
+	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+		return NULL;
+
 	pr_debug("mqd type %d\n", type);
 
 	mqd = dqm->mqds[type];
@@ -789,6 +792,8 @@ static void uninitialize_nocpsch(struct device_queue_manager *dqm)
 {
 	int i;
 
+	WARN_ON(dqm->queue_count > 0 || dqm->processes_count > 0);
+
 	kfree(dqm->allocated_queues);
 	for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
 		kfree(dqm->mqds[i]);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index b303e57..6dc7e36 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -366,6 +366,7 @@ void kernel_queue_uninit(struct kernel_queue *kq)
 	kfree(kq);
 }
 
+/* FIXME: Can this test be removed? */
 static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
 {
 	struct kernel_queue *kq;
@@ -375,8 +376,18 @@ static __attribute__((unused)) void test_kq(struct kfd_dev *dev)
 	pr_err("Starting kernel queue test\n");
 
 	kq = kernel_queue_init(dev, KFD_QUEUE_TYPE_HIQ);
+	if (unlikely(!kq)) {
+		pr_err("  Failed to initialize HIQ\n");
+		pr_err("Kernel queue test failed\n");
+		return;
+	}
 
 	retval = kq->ops.acquire_packet_buffer(kq, 5, &buffer);
+	if (unlikely(retval != 0)) {
+		pr_err("  Failed to acquire packet buffer\n");
+		pr_err("Kernel queue test failed\n");
+		return;
+	}
 	for (i = 0; i < 5; i++)
 		buffer[i] = kq->nop_packet;
 	kq->ops.submit_packet(kq);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 89edf3c..beb8732 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -189,7 +189,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "queue type %d\n", q->properties.type);
-		break;
+		return -EINVAL;
 	}
 	packet->bitfields3.doorbell_offset =
 			q->properties.doorbell_off;
@@ -234,7 +234,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "queue type %d\n", type);
-		break;
+		return -EINVAL;
 	}
 
 	if (reset)
@@ -267,7 +267,7 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "filter %d\n", filter);
-		break;
+		return -EINVAL;
 	}
 
 	return 0;
@@ -305,8 +305,6 @@ static uint32_t pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
 {
 	struct pm4_mec_release_mem *packet;
 
-	WARN_ON(!buffer);
-
 	packet = (struct pm4_mec_release_mem *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index 007a3ea..5fbc5a0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -194,7 +194,7 @@ int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "queue type %d\n", q->properties.type);
-		break;
+		return -EINVAL;
 	}
 	packet->bitfields3.doorbell_offset =
 			q->properties.doorbell_off;
@@ -267,7 +267,7 @@ int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "queue type %d\n", type);
-		break;
+		return -EINVAL;
 	}
 
 	if (reset)
@@ -300,7 +300,7 @@ int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 		break;
 	default:
 		WARN(1, "filter %d\n", filter);
-		break;
+		return -EINVAL;
 	}
 
 	return 0;
@@ -338,8 +338,6 @@ uint32_t pm_release_mem_vi(uint64_t gpu_addr, uint32_t *buffer)
 {
 	struct pm4_mec_release_mem *packet;
 
-	WARN_ON(!buffer);
-
 	packet = (struct pm4_mec_release_mem *)buffer;
 	memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index d50e32b..71e7521 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -485,6 +485,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 {
 	struct mqd_manager *mqd;
 
+	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+		return NULL;
+
 	mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
 	if (!mqd)
 		return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 58dbd85..e698fc1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -487,6 +487,9 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 {
 	struct mqd_manager *mqd;
 
+	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+		return NULL;
+
 	mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
 	if (!mqd)
 		return NULL;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index bd419d6..0206d54 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -92,6 +92,9 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
 {
 	int retval;
 
+	if (WARN_ON(pm->allocated))
+		return -EINVAL;
+
 	pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
 
 	mutex_lock(&pm->lock);
@@ -203,15 +206,16 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 	pr_debug("Finished map process and queues to runlist\n");
 
 	if (is_over_subscription)
-		pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr,
-				alloc_size_bytes / sizeof(uint32_t), true);
+		retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
+					*rl_gpu_addr,
+					alloc_size_bytes / sizeof(uint32_t),
+					true);
 
 	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
 		pr_debug("0x%2X ", rl_buffer[i]);
-
 	pr_debug("\n");
 
-	return 0;
+	return retval;
 }
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
@@ -259,6 +263,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 				struct scheduling_resources *res)
 {
 	uint32_t *buffer, size;
+	int retval = 0;
 
 	size = pm->pmf->get_set_resources_packet_size();
 	mutex_lock(&pm->lock);
@@ -266,18 +271,21 @@ int pm_send_set_resources(struct packet_manager *pm,
 				size / sizeof(uint32_t),
 				(unsigned int **)&buffer);
 	if (!buffer) {
-		mutex_unlock(&pm->lock);
 		pr_err("Failed to allocate buffer on kernel queue\n");
-		return -ENOMEM;
+		retval = -ENOMEM;
+		goto out;
 	}
 
-	pm->pmf->set_resources(pm, buffer, res);
-
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	retval = pm->pmf->set_resources(pm, buffer, res);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
 
+out:
 	mutex_unlock(&pm->lock);
 
-	return 0;
+	return retval;
 }
 
 int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
@@ -330,6 +338,9 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 	uint32_t *buffer, size;
 	int retval = 0;
 
+	if (WARN_ON(!fence_address))
+		return -EFAULT;
+
 	size = pm->pmf->get_query_status_packet_size();
 	mutex_lock(&pm->lock);
 	pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
@@ -339,8 +350,12 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 		retval = -ENOMEM;
 		goto out;
 	}
-	pm->pmf->query_status(pm, buffer, fence_address, fence_value);
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+
+	retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
 
 out:
 	mutex_unlock(&pm->lock);
@@ -364,9 +379,13 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
 		retval = -ENOMEM;
 		goto out;
 	}
-	pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param, reset,
-			      sdma_engine);
-	pm->priv_queue->ops.submit_packet(pm->priv_queue);
+
+	retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
+				       reset, sdma_engine);
+	if (!retval)
+		pm->priv_queue->ops.submit_packet(pm->priv_queue);
+	else
+		pm->priv_queue->ops.rollback_packet(pm->priv_queue);
 
 out:
 	mutex_unlock(&pm->lock);
-- 
2.7.4