aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/1727-drm-amd-Implement-parallel-memory-mapping-on-mGPUs.patch
blob: 44040fbccb0642730f7bf9c10360b76f3940cdb7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
From 909b82ea5625d797f9bde9be6378ba3ee8a55ec5 Mon Sep 17 00:00:00 2001
From: Lan Xiao <Lan.Xiao@amd.com>
Date: Fri, 23 Jun 2017 16:06:48 -0400
Subject: [PATCH 1727/4131] drm/amd: Implement parallel memory mapping on mGPUs

Alter the KFD-KGD interface so that memory mappings on multiple GPUs
are issued concurrently instead of sequentially. The fences produced
while mapping are stored in the associated kgd_mem object, and all of
them are waited on only after the mappings have been submitted.

This change also makes the wait interruptible, with proper signal
handling.

Change-Id: I9ae7f4bd54165b14dd5b37df5df6516aa80cba83
Signed-off-by: Lan Xiao <Lan.Xiao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h        |  4 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 74 +++++++++++++++++------
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c          |  7 +++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c          |  8 +++
 drivers/gpu/drm/amd/include/kgd_kfd_interface.h   |  2 +
 8 files changed, 78 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ba1e24c9..924e28a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -64,6 +64,8 @@ struct kgd_mem {
         struct amdkfd_process_info *process_info;
         struct page **user_pages;
 
+        struct amdgpu_sync sync;
+
 
         /* flags bitfield */
 	bool coherent      : 1;
@@ -190,6 +192,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
 	})
 
 /* GPUVM API */
+int amdgpu_amdkfd_gpuvm_sync_memory(
+		struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 		struct kgd_dev *kgd, uint64_t va, uint64_t size,
 		void *vm, struct kgd_mem **mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 4549dc0..0b2595e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -216,6 +216,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.read_vmid_from_vmfault_reg = read_vmid_from_vmfault_reg,
 	.write_vmid_invalidate_request = write_vmid_invalidate_request,
 	.invalidate_tlbs = invalidate_tlbs,
+	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
 	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
 	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
 	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index 76e3d5d..08da99f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -189,6 +189,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
 			get_atc_vmid_pasid_mapping_valid,
 	.write_vmid_invalidate_request = write_vmid_invalidate_request,
 	.invalidate_tlbs = invalidate_tlbs,
+	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
 	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
 	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
 	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index d10d213..42e0094 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -234,6 +234,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
 			get_atc_vmid_pasid_mapping_valid,
 	.write_vmid_invalidate_request = write_vmid_invalidate_request,
 	.invalidate_tlbs = invalidate_tlbs,
+	.sync_memory = amdgpu_amdkfd_gpuvm_sync_memory,
 	.alloc_memory_of_gpu = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu,
 	.free_memory_of_gpu = amdgpu_amdkfd_gpuvm_free_memory_of_gpu,
 	.map_memory_to_gpu = amdgpu_amdkfd_gpuvm_map_memory_to_gpu,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 8384dfb..475e7fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -655,6 +655,8 @@ static int __alloc_memory_of_gpu(struct kgd_dev *kgd, uint64_t va,
 
 	alloc_domain = userptr ? AMDGPU_GEM_DOMAIN_CPU : domain;
 
+	amdgpu_sync_create(&(*mem)->sync);
+
 	ret = amdgpu_amdkfd_reserve_system_mem_limit(adev, size, alloc_domain);
 	if (ret) {
 		pr_err("Insufficient system memory\n");
@@ -730,7 +732,7 @@ struct bo_vm_reservation_context {
 	struct amdgpu_bo_list_entry *vm_pd;
 	struct ww_acquire_ctx ticket;
 	struct list_head list, duplicates;
-	struct amdgpu_sync sync;
+	struct amdgpu_sync *sync;
 	bool reserved;
 };
 
@@ -751,7 +753,7 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
 
 	ctx->reserved = false;
 	ctx->n_vms = 1;
-	amdgpu_sync_create(&ctx->sync);
+	ctx->sync = &mem->sync;
 
 	INIT_LIST_HEAD(&ctx->list);
 	INIT_LIST_HEAD(&ctx->duplicates);
@@ -812,7 +814,7 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
 	ctx->reserved = false;
 	ctx->n_vms = 0;
 	ctx->vm_pd = NULL;
-	amdgpu_sync_create(&ctx->sync);
+	ctx->sync = &mem->sync;
 
 	INIT_LIST_HEAD(&ctx->list);
 	INIT_LIST_HEAD(&ctx->duplicates);
@@ -867,19 +869,27 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
 	return ret;
 }
 
-static void unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
-				 bool wait)
+static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
+				 bool wait, bool intr)
 {
-	if (wait) /* FIXME: when called from user context, this needs to be interruptible */
-		amdgpu_sync_wait(&ctx->sync, false);
+	int ret = 0;
+
+	if (wait) {
+		ret = amdgpu_sync_wait(ctx->sync, intr);
+		if (ret)
+			return ret;
+	}
 
 	if (ctx->reserved)
 		ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
 	kfree(ctx->vm_pd);
 
-	amdgpu_sync_free(&ctx->sync);
+	ctx->sync = NULL;
+
 	ctx->reserved = false;
 	ctx->vm_pd = NULL;
+
+	return ret;
 }
 
 static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
@@ -1051,6 +1061,25 @@ static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size)
 	return sg;
 }
 
+int amdgpu_amdkfd_gpuvm_sync_memory(
+		struct kgd_dev *kgd, struct kgd_mem *mem, bool intr)
+{
+	int ret = 0;
+	struct amdgpu_sync sync;
+	struct amdgpu_device *adev;
+
+	adev = get_amdgpu_device(kgd);
+	amdgpu_sync_create(&sync);
+
+	mutex_lock(&mem->lock);
+	amdgpu_sync_clone(adev, &mem->sync, &sync);
+	mutex_unlock(&mem->lock);
+
+	ret = amdgpu_sync_wait(&sync, intr);
+	amdgpu_sync_free(&sync);
+	return ret;
+}
+
 #define BOOL_TO_STR(b)	(b == true) ? "true" : "false"
 
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
@@ -1137,7 +1166,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 	struct amdgpu_device *adev;
 	struct kfd_bo_va_list *entry, *tmp;
 	struct bo_vm_reservation_context ctx;
-	int ret;
+	int ret = 0;
 	struct ttm_validate_buffer *bo_list_entry;
 	struct amdkfd_process_info *process_info;
 	unsigned long bo_size;
@@ -1199,7 +1228,10 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 				entry, bo_size);
 	}
 
-	unreserve_bo_and_vms(&ctx, false);
+	ret = unreserve_bo_and_vms(&ctx, false, true);
+
+	/* Free the sync object */
+	amdgpu_sync_free(&mem->sync);
 
 	/* If the SG is not NULL, it's one we created for a doorbell
 	 * BO. We need to free it.
@@ -1213,7 +1245,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
 	amdgpu_bo_unref(&mem->bo);
 	kfree(mem);
 
-	return 0;
+	return ret;
 }
 
 int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
@@ -1308,7 +1340,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 					entry->va, entry->va + bo_size,
 					entry);
 
-			ret = map_bo_to_gpuvm(adev, entry, &ctx.sync,
+			ret = map_bo_to_gpuvm(adev, entry, ctx.sync,
 					      is_invalid_userptr);
 			if (ret != 0) {
 				pr_err("Failed to map radeon bo to gpuvm\n");
@@ -1325,7 +1357,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 		amdgpu_bo_fence(bo,
 				&kfd_vm->process_info->eviction_fence->base,
 				true);
-	unreserve_bo_and_vms(&ctx, true);
+	ret = unreserve_bo_and_vms(&ctx, false, true);
 
 	mutex_unlock(&mem->process_info->lock);
 	mutex_unlock(&mem->lock);
@@ -1338,7 +1370,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 	if (bo_va_entry)
 		remove_bo_from_vm(adev, bo_va_entry, bo_size);
 add_bo_to_vm_failed:
-	unreserve_bo_and_vms(&ctx, false);
+	unreserve_bo_and_vms(&ctx, false, false);
 bo_reserve_failed:
 	mutex_unlock(&mem->process_info->lock);
 	mutex_unlock(&mem->lock);
@@ -1569,7 +1601,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 					entry->va + bo_size,
 					entry);
 
-			ret = unmap_bo_from_gpuvm(adev, entry, &ctx.sync);
+			ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync);
 			if (ret == 0) {
 				entry->is_mapped = false;
 			} else {
@@ -1600,7 +1632,7 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 	}
 
 unreserve_out:
-	unreserve_bo_and_vms(&ctx, false);
+	unreserve_bo_and_vms(&ctx, false, false);
 out:
 	mutex_unlock(&mem->lock);
 	return ret;
@@ -2235,6 +2267,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 	struct amdgpu_amdkfd_fence *old_fence;
 	int ret = 0, i;
 	struct list_head duplicate_save;
+	struct amdgpu_sync sync_obj;
 
 	INIT_LIST_HEAD(&duplicate_save);
 	INIT_LIST_HEAD(&ctx.list);
@@ -2287,7 +2320,8 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 	if (!list_empty(&duplicate_save))
 		pr_err("BUG: list of BOs to reserve has duplicates!\n");
 
-	amdgpu_sync_create(&ctx.sync);
+	amdgpu_sync_create(&sync_obj);
+	ctx.sync = &sync_obj;
 
 	/* Validate PDs and PTs */
 	ret = process_validate_vms(process_info);
@@ -2322,7 +2356,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 			ret = update_gpuvm_pte((struct amdgpu_device *)
 					      bo_va_entry->kgd_dev,
 					      bo_va_entry,
-					      &ctx.sync);
+					      ctx.sync);
 			if (ret) {
 				pr_debug("Memory eviction: update PTE failed. Try again\n");
 				goto validate_map_fail;
@@ -2330,7 +2364,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 		}
 	}
 
-	amdgpu_sync_wait(&ctx.sync, false);
+	amdgpu_sync_wait(ctx.sync, false);
 
 	/* Wait for validate to finish and attach new eviction fence */
 	list_for_each_entry(mem, &process_info->kfd_bo_list,
@@ -2350,7 +2384,7 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info)
 	}
 validate_map_fail:
 	ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list);
-	amdgpu_sync_free(&ctx.sync);
+	amdgpu_sync_free(&sync_obj);
 ttm_reserve_fail:
 	mutex_unlock(&process_info->lock);
 evict_fence_fail:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 64a4373..dbc3afd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1398,6 +1398,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 			pr_err("Failed to map\n");
 	}
 
+	err = dev->kfd2kgd->sync_memory(dev->kgd, (struct kgd_mem *) mem, true);
+	if (err) {
+		pr_debug("Sync memory failed, wait interrupted by user signal\n");
+		goto sync_memory_failed;
+	}
+
 	if (args->device_ids_array_size > 0 && devices_arr)
 		kfree(devices_arr);
 
@@ -1407,6 +1413,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 	up_write(&p->lock);
 get_mem_obj_from_handle_failed:
 copy_from_user_failed:
+sync_memory_failed:
 	kfree(devices_arr);
 	return err;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d1ef118..f5e2282 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -136,6 +136,13 @@ static int kfd_process_alloc_gpuvm(struct kfd_process *p,
 	if (err)
 		goto err_map_mem;
 
+	err = kdev->kfd2kgd->sync_memory(kdev->kgd, (struct kgd_mem *) mem,
+				true);
+	if (err) {
+		pr_debug("Sync memory failed, wait interrupted by user signal\n");
+		goto sync_memory_failed;
+	}
+
 	/* Create an obj handle so kfd_process_device_remove_obj_handle
 	 * will take care of the bo removal when the process finishes.
 	 * We do not need to take p->lock, because the process is just
@@ -151,6 +158,7 @@ static int kfd_process_alloc_gpuvm(struct kfd_process *p,
 	return err;
 
 free_gpuvm:
+sync_memory_failed:
 	kfd_process_free_gpuvm(mem, pdd);
 	return err;
 
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index c0c1cc7..1364429 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -337,6 +337,8 @@ struct kfd2kgd_calls {
 
 	int (*invalidate_tlbs)(struct kgd_dev *kgd, uint16_t pasid);
 
+	int (*sync_memory)(struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
+
 	int (*alloc_memory_of_gpu)(struct kgd_dev *kgd, uint64_t va,
 			uint64_t size, void *vm,
 			struct kgd_mem **mem, uint64_t *offset,
-- 
2.7.4