aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/4267-drm-amdkfd-CMA-Support-multi-device-VRAM-copy.patch
blob: 33b352399e8a816727e65499db2461146bc20bcf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
From aae9664f4449916f2f353727bdddceb1e98c3752 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Thu, 12 Apr 2018 14:56:17 -0400
Subject: [PATCH 4267/5725] drm/amdkfd: CMA: Support multi device VRAM copy

Support copy from VRAM on device1 to VRAM on device2. This is done using
an intermediate System BO and double copy.
	[VRAM]--gpu1-->[System BO]--gpu2-->[VRAM]

BUG: SWDEV-150755

Change-Id: I7edf2df3cc1688c1ebd1fa0ea8fa82d39cbf50d1
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 124 +++++++++++++++++++++++--------
 1 file changed, 95 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index b07fe36..66c294a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1831,7 +1831,8 @@ static void kfd_free_cma_bos(struct cma_iter *ci)
 		struct kfd_dev *dev = cma_bo->dev;
 
 		/* sg table is deleted by free_memory_of_gpu */
-		kfd_put_sg_table(cma_bo->sg);
+		if (cma_bo->sg)
+			kfd_put_sg_table(cma_bo->sg);
 		dev->kfd2kgd->free_memory_of_gpu(dev->kgd, cma_bo->mem);
 		list_del(&cma_bo->list);
 		kfree(cma_bo);
@@ -1867,16 +1868,21 @@ static int kfd_fence_put_wait_if_diff_context(struct dma_fence *cf,
 	return ret;
 }
 
-/* Create a system BO by pinning underlying system pages of the given userptr
- * BO @ubo
- * @ubo: Userptr BO
- * @offset: Offset into ubo
+#define MAX_SYSTEM_BO_SIZE (512*PAGE_SIZE)
+
+/* Create an equivalent system BO for the given @bo. If @bo is a userptr then
+ * create a new system BO by pinning underlying system pages of the given
+ * userptr BO. If @bo is in Local Memory then create an empty system BO and
+ * then copy @bo into this new BO.
+ * @bo: Userptr BO or Local Memory BO
+ * @offset: Offset into bo
  * @size: in/out: The size of the new BO could be less than requested if all
- *        the pages couldn't be pinned. This would be reflected in @size
- * @mm/@task: mm/task to which @ubo belongs to
+ *        the pages couldn't be pinned or size > MAX_SYSTEM_BO_SIZE. This would
+ *        be reflected in @size
+ * @mm/@task: mm/task to which @bo belongs to
  * @cma_bo: out: new system BO
  */
-static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
+static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *bo,
 				    uint64_t *size, uint64_t offset,
 				    int cma_write, struct kfd_process *p,
 				    struct mm_struct *mm,
@@ -1886,7 +1892,8 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
 	int ret;
 	struct kfd_process_device *pdd = NULL;
 	struct cma_system_bo *cbo;
-	uint64_t sg_size;
+	uint64_t bo_size = 0;
+	struct dma_fence *f;
 
 	uint32_t flags = ALLOC_MEM_FLAGS_GTT | ALLOC_MEM_FLAGS_NONPAGED |
 			 ALLOC_MEM_FLAGS_NO_SUBSTITUTE;
@@ -1897,40 +1904,75 @@ static int kfd_create_cma_system_bo(struct kfd_dev *kdev, struct kfd_bo *ubo,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&cbo->list);
-	ret = kfd_create_sg_table_from_userptr_bo(ubo, offset, cma_write, mm,
-						  task, size, &sg_size,
-						  &cbo->sg);
-	if (ret) {
-		pr_err("Failed to create system BO. sg table error %d\n", ret);
-		return ret;
+	if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+		bo_size = min(*size, MAX_SYSTEM_BO_SIZE);
+	else if (bo->cpuva) {
+		ret = kfd_create_sg_table_from_userptr_bo(bo, offset,
+							  cma_write, mm, task,
+							  size, &bo_size,
+							  &cbo->sg);
+		if (ret) {
+			pr_err("CMA: BO create with sg failed %d\n", ret);
+			goto sg_fail;
+		}
+	} else {
+		WARN_ON(1);
+		ret = -EINVAL;
+		goto sg_fail;
 	}
-
 	mutex_lock(&p->mutex);
 	pdd = kfd_get_process_device_data(kdev, p);
 	if (!pdd) {
+		mutex_unlock(&p->mutex);
 		pr_err("Process device data doesn't exist\n");
 		ret = -EINVAL;
 		goto pdd_fail;
 	}
 
-	ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, sg_size,
+	ret = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, 0ULL, bo_size,
 						 pdd->vm, cbo->sg,
 						 &cbo->mem, NULL, flags);
+	mutex_unlock(&p->mutex);
 	if (ret) {
 		pr_err("Failed to create shadow system BO %d\n", ret);
 		goto pdd_fail;
 	}
-	mutex_unlock(&p->mutex);
+
+	if (bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+		ret = kdev->kfd2kgd->copy_mem_to_mem(kdev->kgd, bo->mem,
+						     offset, cbo->mem, 0,
+						     bo_size, &f, size);
+		if (ret) {
+			pr_err("CMA: Intermediate copy failed %d\n", ret);
+			goto copy_fail;
+		}
+
+		/* Wait for the copy to finish as subsequent copy will be done
+		 * by a different device
+		 */
+		ret = kfd_cma_fence_wait(f);
+		dma_fence_put(f);
+		if (ret) {
+			pr_err("CMA: Intermediate copy timed out %d\n", ret);
+			goto copy_fail;
+		}
+	}
+
 	cbo->dev = kdev;
 	*cma_bo = cbo;
 
 	return ret;
 
+copy_fail:
+	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, bo->mem);
 pdd_fail:
-	mutex_unlock(&p->mutex);
-	kfd_put_sg_table(cbo->sg);
-	sg_free_table(cbo->sg);
-	kfree(cbo->sg);
+	if (cbo->sg) {
+		kfd_put_sg_table(cbo->sg);
+		sg_free_table(cbo->sg);
+		kfree(cbo->sg);
+	}
+sg_fail:
+	kfree(cbo);
 	return ret;
 }
 
@@ -2153,6 +2195,7 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
 	uint64_t src_offset = si->bo_offset, dst_offset = di->bo_offset;
 	struct kgd_mem *src_mem = src_bo->mem, *dst_mem = dst_bo->mem;
 	struct kfd_dev *dev = dst_bo->dev;
+	struct cma_system_bo *tmp_bo = NULL;
 
 	*copied = 0;
 	if (f)
@@ -2188,11 +2231,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
 		dst_offset = di->bo_offset & (PAGE_SIZE - 1);
 		list_add_tail(&di->cma_bo->list, &di->cma_list);
 	} else if (src_bo->dev->kgd != dst_bo->dev->kgd) {
-		/* This indicates that either or/both BOs are in local mem. */
+		/* This indicates that at least one of the BOs is in local mem.
+		 * If both are in local mem of different devices then create an
+		 * intermediate System BO and do a double copy
+		 * [VRAM]--gpu1-->[System BO]--gpu2-->[VRAM].
+		 * If only one BO is in VRAM then use that GPU to do the copy
+		 */
 		if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM &&
 		    dst_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
-			pr_err("CMA fail. Local mem & not in same dev\n");
-			return -EINVAL;
+			dev = dst_bo->dev;
+			err = kfd_create_cma_system_bo(src_bo->dev, src_bo,
+						       &size, si->bo_offset,
+						       cma_write, si->p,
+						       si->mm, si->task,
+						       &tmp_bo);
+			src_mem = tmp_bo->mem;
+			src_offset = 0;
 		} else if (src_bo->mem_type == KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
 			dev = src_bo->dev;
 		/* else already set to dst_bo->dev */
@@ -2203,10 +2257,22 @@ static int kfd_copy_bos(struct cma_iter *si, struct cma_iter *di,
 		return -EINVAL;
 	}
 
-	err = dst_bo->dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem,
-						     src_offset, dst_mem,
-						     dst_offset, size, f,
-						     copied);
+	err = dev->kfd2kgd->copy_mem_to_mem(dev->kgd, src_mem, src_offset,
+					    dst_mem, dst_offset, size, f,
+					    copied);
+	/* The tmp_bo allocates additional memory. So it is better to wait and
+	 * delete. Also since multiple GPUs are involved the copies are
+	 * currently not pipelined.
+	 */
+	if (tmp_bo) {
+		if (!err) {
+			kfd_cma_fence_wait(*f);
+			dma_fence_put(*f);
+			*f = NULL;
+		}
+		dev->kfd2kgd->free_memory_of_gpu(dev->kgd, tmp_bo->mem);
+		kfree(tmp_bo);
+	}
 	return err;
 }
 
-- 
2.7.4