aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.14.71/4162-drm-amdkfd-CMA-Handle-userptr-to-userptr-BO-copy.patch
blob: 0ab2b27c63e3aed4b5e572dd39388a53ca05bd8e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
From 4d98ca2586f4857e43946b29175cb5d953d79b15 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Mon, 26 Mar 2018 16:45:06 -0400
Subject: [PATCH 4162/5725] drm/amdkfd: CMA: Handle userptr to userptr BO copy

The CMA userptr implementation is incomplete because it doesn't
properly handle the case where the BO is evicted. This patch handles
the case where both source and destination BOs are userptr. It is more
efficient to use the CPU to do the copy in this case, very similar to
the process_vm_read/write() functions.

Change-Id: I5d01d906f04190d71e8663785718060411dede4e
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Kalyan Alle <kalyan.alle@amd.com>

Conflicts:
      drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 179 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |   2 +
 2 files changed, 172 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index ef1bd27..bd09647 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -35,6 +35,7 @@
 #include <linux/mman.h>
 #include <asm/processor.h>
 #include <linux/ptrace.h>
+#include <linux/pagemap.h>
 
 #include "kfd_priv.h"
 #include "kfd_device_queue_manager.h"
@@ -1714,6 +1715,12 @@ static int kfd_ioctl_ipc_import_handle(struct file *filep,
 	return r;
 }
 
+/* Maximum number of entries for process pages array which lives on stack */
+#define MAX_PP_STACK_COUNT 16
+/* Maximum number of pages kmalloc'd to hold struct page's during copy */
+#define MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+#define MAX_PP_KMALLOC_COUNT (MAX_KMALLOC_PAGES/sizeof(struct page *))
+
 /* Update cma_iter.cur_bo with KFD BO that is assocaited with
  * cma_iter.array.va_addr
  */
@@ -1762,7 +1769,8 @@ static int kfd_cma_iter_advance(struct cma_iter *ci, unsigned long size)
 }
 
 static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
-				struct kfd_process *p, struct cma_iter *ci)
+				struct kfd_process *p, struct mm_struct *mm,
+				struct task_struct *task, struct cma_iter *ci)
 {
 	int ret;
 	int nr;
@@ -1775,6 +1783,8 @@ static int kfd_cma_iter_init(struct kfd_memory_range *arr, unsigned long segs,
 	ci->nr_segs = segs;
 	ci->p = p;
 	ci->offset = 0;
+	ci->mm = mm;
+	ci->task = task;
 	for (nr = 0; nr < segs; nr++)
 		ci->total += arr[nr].size;
 
@@ -1795,6 +1805,159 @@ static bool kfd_cma_iter_end(struct cma_iter *ci)
 	return false;
 }
 
+/* Copies @size bytes from si->cur_bo to di->cur_bo BO. The function assumes
+ * both source and dest. BOs are userptr BOs. Both BOs can either belong to
+ * current process or one of the BOs can belong to a different
+ * process. @Returns 0 on success, -ve on failure
+ *
+ * @si: Source iter
+ * @di: Dest. iter
+ * @cma_write: Indicates if it is write to remote or read from remote
+ * @size: amount of bytes to be copied
+ * @copied: Return number of bytes actually copied.
+ */
+static int kfd_copy_userptr_bos(struct cma_iter *si, struct cma_iter *di,
+				bool cma_write, uint64_t size,
+				uint64_t *copied)
+{
+	int i, ret = 0, locked, nl;
+	unsigned int nents;
+	unsigned int offset_in_page;
+	struct page *pp_stack[MAX_PP_STACK_COUNT];
+	struct page **process_pages = pp_stack;
+	unsigned long rva, lva = 0, flags = 0;
+	uint64_t copy_size, to_copy = size;
+	struct cma_iter *li, *ri;
+
+	if (cma_write) {
+		ri = di;
+		li = si;
+		flags |= FOLL_WRITE;
+	} else {
+		li = di;
+		ri = si;
+	}
+	/* rva: remote virtual address. Page aligned to start page.
+	 * rva + offset_in_page: Points to remote start address
+	 * lva: local virtual address. Points to the start address.
+	 * nents: computes number of remote pages to request
+	 */
+	offset_in_page = ri->bo_offset & (PAGE_SIZE - 1);
+	rva = (ri->cur_bo->cpuva + ri->bo_offset) & PAGE_MASK;
+	lva = li->cur_bo->cpuva + li->bo_offset;
+
+	nents = (size + offset_in_page + PAGE_SIZE - 1) / PAGE_SIZE;
+
+	copy_size = min_t(uint64_t, size, PAGE_SIZE - offset_in_page);
+	*copied = 0;
+
+	if (nents > MAX_PP_STACK_COUNT) {
+		/* For reliability kmalloc only 2 pages worth */
+		process_pages = kmalloc(min_t(size_t, MAX_KMALLOC_PAGES,
+					      sizeof(struct page *)*nents),
+					GFP_KERNEL);
+
+		if (!process_pages)
+			return -ENOMEM;
+	}
+
+	while (nents && to_copy) {
+		nl = min_t(unsigned int, MAX_PP_KMALLOC_COUNT, nents);
+		locked = 1;
+		down_read(&ri->mm->mmap_sem);
+		nl = get_user_pages_remote(ri->task, ri->mm, rva, nl,
+					   flags, process_pages, NULL,
+					   &locked);
+		if (locked)
+			up_read(&ri->mm->mmap_sem);
+		if (nl <= 0) {
+			pr_err("CMA: Invalid virtual address 0x%lx\n", rva);
+			ret = -EFAULT;
+			break;
+		}
+
+		for (i = 0; i < nl; i++) {
+			unsigned int n;
+			void *kaddr = kmap_atomic(process_pages[i]);
+
+			if (cma_write) {
+				n = copy_from_user(kaddr+offset_in_page,
+						   (void *)lva, copy_size);
+				set_page_dirty(process_pages[i]);
+			} else {
+				n = copy_to_user((void *)lva,
+						 kaddr+offset_in_page,
+						 copy_size);
+			}
+			kunmap_atomic(kaddr);
+			if (n) {
+				ret = -EFAULT;
+				break;
+			}
+			to_copy -= copy_size;
+			if (!to_copy)
+				break;
+			lva += copy_size;
+			rva += (copy_size + offset_in_page);
+			WARN_ONCE(rva & (PAGE_SIZE - 1),
+				  "CMA: Error in remote VA computation");
+			offset_in_page = 0;
+			copy_size = min_t(uint64_t, to_copy, PAGE_SIZE);
+		}
+
+		for (i = 0; i < nl; i++)
+			put_page(process_pages[i]);
+
+		if (ret)
+			break;
+		nents -= nl;
+	}
+
+	if (process_pages != pp_stack)
+		kfree(process_pages);
+
+	*copied = (size - to_copy);
+	return ret;
+
+}
+
+/* Copies @size bytes from si->cur_bo to di->cur_bo starting at their
+ * respective offset.
+ * @si: Source iter
+ * @di: Dest. iter
+ * @cma_write: Indicates if it is write to remote or read from remote
+ * @size: amount of bytes to be copied
+ * @f: Return the last fence if any
+ * @copied: Return number of bytes actually copied.
+ */
+static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
+			int cma_write, uint64_t size,
+			struct dma_fence **f, uint64_t *copied)
+{
+	int err = 0;
+	struct kfd_bo *dst_bo = di->cur_bo, *src_bo = si->cur_bo;
+	uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
+	struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
+
+	*copied = 0;
+	if (f)
+		*f = NULL;
+	if (src_bo->cpuva && dst_bo->cpuva)
+		return kfd_copy_userptr_bos(si, di, cma_write, size, copied);
+
+	if (src_bo->dev->kgd != dst_bo->dev->kgd) {
+		pr_err("CMA %d fail. Not same dev\n", cma_write);
+		return -EINVAL;
+	}
+
+	err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd, src_mem,
+						     src_offset, dst_mem,
+						     dst_offset, size, f,
+						     copied);
+
+	return err;
+}
+
 /* Copy single range from source iterator @si to destination iterator @di.
  * @si will move to next range and @di will move by bytes copied.
  * @return : 0 for success or -ve for failure
@@ -1829,11 +1992,9 @@ static int kfd_copy_single_range(struct cma_iter *si, struct cma_iter *di,
 			return -EINVAL;
 		}
 
-		err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(src_bo->dev->kgd,
-			src_bo->mem, si->bo_offset, dst_bo->mem, di->bo_offset,
-			copy_size, &fence, &n);
+		err = kfd_copy_bos(si, di, cma_write, copy_size, &fence, &n);
 		if (err) {
-			pr_err("GPU CMA %d failed\n", err);
+			pr_err("CMA %d failed\n", err);
 			break;
 		}
 
@@ -1942,11 +2103,11 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
 		cma_op = "WRITE";
 		pr_debug("CMA WRITE: local -> remote\n");
 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-					remote_p, &di);
+					remote_p, remote_mm, remote_task, &di);
 		if (err)
 			goto kfd_process_fail;
 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-					local_p, &si);
+					local_p, current->mm, current, &si);
 		if (err)
 			goto kfd_process_fail;
 	} else {
@@ -1954,11 +2115,11 @@ static int kfd_ioctl_cross_memory_copy(struct file *filep,
 		pr_debug("CMA READ: remote -> local\n");
 
 		err = kfd_cma_iter_init(dst_array, args->dst_mem_array_size,
-					local_p, &di);
+					local_p, current->mm, current, &di);
 		if (err)
 			goto kfd_process_fail;
 		err = kfd_cma_iter_init(src_array, args->src_mem_array_size,
-					remote_p, &si);
+					remote_p, remote_mm, remote_task, &si);
 		if (err)
 			goto kfd_process_fail;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index facd9d9..2744154 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -316,6 +316,8 @@ struct cma_iter {
 	/* offset into the entry pointed by cma_iter.array */
 	unsigned long offset;
 	struct kfd_process *p;
+	struct mm_struct *mm;
+	struct task_struct *task;
 	/* current kfd_bo associated with cma_iter.array.va_addr */
 	struct kfd_bo *cur_bo;
 	/* offset w.r.t cur_bo */
-- 
2.7.4