path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/5637-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
From de7edd2adbdcbd3a34f3d1df96884b4a59904b29 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Mon, 26 Mar 2018 16:45:06 -0400
Subject: [PATCH 5637/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy

The CMA userptr implementation is incomplete because it doesn't properly
handle the case where the BO is evicted. This patch handles the case where
both the source and destination BOs are userptr BOs. It is more efficient
to use the CPU to do the copy in this case, very similar to the
process_vm_readv()/process_vm_writev() functions.

Change-Id: I5d01d906f04190d71e8663785718060411dede4e
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>

Conflicts:
      drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 273 ++++++++++++++++++++++++-------
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   2 +
 mm/gup.c                                 |  11 ++
 3 files changed, 230 insertions(+), 56 deletions(-)
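
For context, the commit message compares the new CPU copy path to
process_vm_readv()/process_vm_writev(). The small user-space sketch below
illustrates that analogy only; it is not part of this patch, and the helper
name and its arguments are placeholders.

    /* Illustrative user-space analogue: copy between two address spaces with
     * the CPU, the same idea the kernel path below implements with
     * get_user_pages_remote() plus kmap_atomic().
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Copy len bytes from remote_src in process pid into dst in this process. */
    static int copy_from_remote(pid_t pid, void *dst, void *remote_src, size_t len)
    {
            struct iovec local  = { .iov_base = dst,        .iov_len = len };
            struct iovec remote = { .iov_base = remote_src, .iov_len = len };
            ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);

            if (n < 0) {
                    perror("process_vm_readv");
                    return -1;
            }
            return (size_t)n == len ? 0 : -1;
    }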

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 91223e2..8941312 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -35,6 +35,7 @@
 #include <linux/mman.h>
 #include <asm/processor.h>
 #include <linux/ptrace.h>
+#include <linux/pagemap.h>
 
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
@@ -1681,6 +1682,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
 }
 
 
+/* Maximum number of entries for process pages array which lives on stack */
+#define MAX_PP_STACK_COUNT 16
+/* Maximum number of pages kmalloc'd to hold struct page's during copy */
+#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
+
 /* Update cma_iter.cur_bo with KFD BO that is assocaited with
  * cma_iter.array.va_addr
  */
@@ -1729,7 +1736,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
 }
 
 static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
-                               struct kfd_process *p, struct cma_iter *ci)
+				struct kfd_process *p, struct mm_struct *mm,
+				struct task_struct *task, struct cma_iter *ci)
 {
        int ret;
        int nr;
@@ -1742,6 +1750,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
        ci->nr_segs = segs;
        ci->p = p;
        ci->offset = 0;
+       ci->mm = mm;
+       ci->task = task;
        for (nr = 0; nr < segs; nr++)
                ci->total += arr[nr].size;
 
@@ -1762,6 +1772,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci)
        return false;
 }
 
+/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes
+ * both source and dest. BOs are userptr BOs. Both BOs can either belong to
+ * current process or one of the BOs can belong to a differnt
+ * process. @Returns 0 on success, -ve on failure
+ *
+ * @si: Source iter
+ * @di: Dest. iter
+ * @cma_write: Indicates if it is write to remote or read from remote
+ * @size: amount of bytes to be copied
+ * @copied: Return number of bytes actually copied.
+ */
+static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
+				bool cma_write, uint64_t size,
+				uint64_t *copied)
+{
+	int i, ret = 0, locked;
+	unsigned int nents, nl;
+	unsigned int offset_in_page;
+	struct page *pp_stack[MAX_PP_STACK_COUNT];
+	struct page **process_pages = pp_stack;
+	unsigned long rva, lva = 0, flags = 0;
+	uint64_t copy_size, to_copy = size;
+	struct cma_iter *li, *ri;
+
+	if (cma_write) {
+		ri = di;
+		li = si;
+		flags |= FOLL_WRITE;
+	} else {
+		li = di;
+		ri = si;
+	}
+	/* rva: remote virtual address. Page aligned to start page.
+	 * rva + offset_in_page: Points to remote start address
+	 * lva: local virtual address. Points to the start address.
+	 * nents: computes number of remote pages to request
+	 */
+	offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
+	rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
+	lva = li->cur_bo->cpuva + li->bo_offset;
+
+	nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
+	*copied = 0;
+
+	if (nents > MAX_PP_STACK_COUNT) {
+		/* For reliability kmalloc only 2 pages worth */
+		process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
+					      sizeof(struct page *)*nents),
+					GFP_KERNEL);
+
+		if (!process_pages)
+			return -ENOMEM;
+	}
+
+	while (nents && to_copy) {
+		nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
+		locked = 1;
+		down_read(&ri->mm->mmap_sem);
+		nl = get_user_pages_remote(ri->task, ri->mm, rva, nl,
+					   flags, process_pages, NULL,
+					   &locked);
+		if (locked)
+			up_read(&ri->mm->mmap_sem);
+		if (nl <= 0) {
+			pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
+			ret = -EFAULT;
+			break;
+		}
+
+		for (i = 0; i < nl; i++) {
+			unsigned int n;
+			void *kaddr = kmap_atomic(process_pages[i]);
+
+			if (cma_write) {
+				n = copy_from_user(kaddr+offset_in_page,
+						   (void *)lva, copy_size);
+				set_page_dirty(process_pages[i]);
+			} else {
+				n = copy_to_user((void *)lva,
+						 kaddr+offset_in_page,
+						 copy_size);
+			}
+			kunmap_atomic(kaddr);
+			if (n) {
+				ret = -EFAULT;
+				break;
+			}
+			to_copy -= copy_size;
+			if (!to_copy)
+				break;
+			lva += copy_size;
+			rva += (copy_size + offset_in_page);
+			WARN_ONCE(rva & (PAGE_SIZE - 1),
+				  "CMA: Error in remote VA computation");
+			offset_in_page = 0;
+			copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
+		}
+
+		for (i = 0; i < nl; i++)
+			put_page(process_pages[i]);
+
+		if (ret)
+			break;
+		nents -= nl;
+	}
+
+	if (process_pages != pp_stack)
+		kfree(process_pages);
+
+	*copied = (size - to_copy);
+	return ret;
+
+}
+
+/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
+ * respective offset.
+ * @si: Source iter
+ * @di: Dest. iter
+ * @cma_write: Indicates if it is write to remote or read from remote
+ * @size: amount of bytes to be copied
+ * @f: Return the last fence if any
+ * @copied: Return number of bytes actually copied.
+ */
+static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+			int cma_write, uint64_t size,
+			struct dma_fence **f, uint64_t *copied)
+{
+	int err = 0;
+	struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
+	uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
+	struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
+
+	*copied = 0;
+	if (f)
+		*f = NULL;
+	if (src_bo->cpuva && dst_bo->cpuva)
+		return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
+
+	if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+		pr_err("CMA %d fail. Not same dev\n", cma_write);
+		err = -EINVAL;
+	}
+
+	err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem,
+						     src_offset, dst_mem,
+						     dst_offset, size, f,
+						     copied);
+
+	return err;
+}
+
 /* Copy single range from source iterator @si to destination iterator @di.
  * @si will move to next range and @di will move by bytes copied.
  * @return : 0 for success or -ve for failure
@@ -1772,57 +1935,55 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
                                 bool cma_write, struct dma_fence **f,
                                 uint64_t *copied)
 {
-       int err = 0;
-       uint64_t copy_size, n;
-       uint64_t size = si->array->size;
-       struct kfd_bo *src_bo = si->cur_bo;
-       struct dma_fence *lfence = NULL;
-
-       if (!src_bo || !di || !copied)
-               return -EINVAL;
-       *copied = 0;
-       if (f)
-               *f = NULL;
-
-       while (size && !kfd_cma_iter_end(di)) {
-               struct dma_fence *fence = NULL;
-               struct kfd_bo *dst_bo = di->cur_bo;
-
-               copy_size = min(size, (di->array->size - di->offset));
-
-               /* Check both BOs belong to same device */
-               if (src_bo->dev->kgd != dst_bo->dev->kgd) {
-                       pr_err("CMA fail. Not same dev\n");
-                       return -EINVAL;
-               }
-
-               err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd,
-                       src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset,
-                       copy_size, &fence, &n);
-               if (err) {
-                       pr_err("GPU CMA %d failed\n", err);
-                       break;
-               }
-
-               if (fence) {
-                       dma_fence_put(lfence);
-                       lfence = fence;
-               }
-               size -= n;
-               *copied += n;
-               err = kfd_cma_iter_advance(si, n);
-               if (err)
-                       break;
-               err = kfd_cma_iter_advance(di, n);
-               if (err)
-                       break;
-       }
-
-       if (f)
-               *f = dma_fence_get(lfence);
-       dma_fence_put(lfence);
-
-       return err;
+	int err = 0;
+	uint64_t copy_size, n;
+	uint64_t size = si->array->size;
+	struct kfd_bo *src_bo = si->cur_bo;
+	struct dma_fence *lfence = NULL;
+	
+	if (!src_bo || !di || !copied)
+	        return -EINVAL;
+	*copied = 0;
+	if (f)
+	        *f = NULL;
+	
+	while (size && !kfd_cma_iter_end(di)) {
+		struct dma_fence *fence = NULL;
+		struct kfd_bo *dst_bo = di->cur_bo;
+		
+		copy_size = min(size, (di->array->size - di->offset));
+		
+		/* Check both BOs belong to same device */
+		if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+		        pr_err("CMA fail. Not same dev\n");
+		        return -EINVAL;
+		}
+		
+		err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
+		if (err) {
+		        pr_err("CMA %d failed\n", err);
+		        break;
+		}
+		
+		if (fence) {
+		        dma_fence_put(lfence);
+		        lfence = fence;
+		}
+		size -= n;
+		*copied += n;
+		err = kfd_cma_iter_advance(si, n);
+		if (err)
+		        break;
+		err = kfd_cma_iter_advance(di, n);
+		if (err)
+		        break;
+	}
+	
+	if (f)
+	        *f = dma_fence_get(lfence);
+	dma_fence_put(lfence);
+	
+	return err;
 }
 
 static int kfd_ioctl_cross_memory_copy(struct file *filep,
@@ -1910,22 +2071,22 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
 		cma_op = "WRITE";
 		pr_debug("CMA WRITE: local -> remote\n");
                 err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-                                        remote_p, &di);
+                                        remote_p, remote_mm, remote_task, &di);
                 if (err)
                         goto kfd_process_fail;
                 err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-                                        local_p, &si);
+                                        local_p, current->mm, current, &si);
                 if (err)
                         goto kfd_process_fail;
 	} else {
 		cma_op = "READ";
 		pr_debug("CMA READ: remote -> local\n");
                 err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-                                        local_p, &di);
+                                        local_p, current->mm, current, &di);
                 if (err)
                         goto kfd_process_fail;
                 err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-                                        remote_p, &si);
+                                        remote_p, remote_mm, remote_task, &si);
                 if (err)
                         goto kfd_process_fail;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 8adfe21..93462fa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -316,6 +316,8 @@ struct cma_iter {
 	/* offset into the entry pointed by cma_iter.array */
 	unsigned long offset;
 	struct kfd_process *p;
+	struct mm_struct *mm;
+	struct task_struct *task;
 	/* current kfd_bo associated with cma_iter.array.va_addr */
 	struct kfd_bo *cur_bo;
 	/* offset w.r.t cur_bo */
diff --git a/mm/gup.c b/mm/gup.c
index 4cc8a6f..35c9f0b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1081,6 +1081,17 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
+long get_user_pages_remote_locked(struct task_struct *tsk, struct mm_struct *mm,
+                unsigned long start, unsigned long nr_pages,
+                unsigned int gup_flags, struct page **pages,
+                struct vm_area_struct **vmas, int *locked)
+{
+        return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+                                       locked, false,
+                                       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote_locked);
+
 /*
  * This is the same as get_user_pages_remote(), just with a
  * less-flexible calling convention where we assume that the task
-- 
2.7.4