path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/1737-drm-amdgpu-Optimize-page-directory-updates-for-KFD.patch
From b93a8bfd66d0254e2920f9d1648fa994b0d821a9 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Tue, 11 Jul 2017 19:56:49 -0400
Subject: [PATCH 1737/4131] drm/amdgpu: Optimize page directory updates for KFD

Updating 4-level page tables is expensive when lots of virtual
address space is used and thousands of page table BOs are
allocated, so avoid doing it unnecessarily. Move page table
allocation into add_bo_to_vm and the page directory update into
vm_validate_pt_pd_bos. The latter is called by add_bo_to_vm and
when restoring from evictions.
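
Roughly, the resulting call flow looks like this (an illustrative
sketch derived from the hunks below, not part of the patch; error
paths and the eviction-fence removal/restore are elided):

    add_bo_to_vm()                      /* once per BO per GPU */
        amdgpu_vm_bo_add()
        amdgpu_vm_alloc_pts()           /* alloc PTs/PDs up front */
        vm_validate_pt_pd_bos()
            amdgpu_vm_update_directories()

    map_bo_to_gpuvm()                   /* per mapping, cheap now */
        amdgpu_vm_bo_map()
        update_gpuvm_pte()
            amdgpu_vm_bo_update()       /* PTEs only, no PD update */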

This reduces the page directory update frequency from once per
mapping operation to only the first time a BO is mapped on each
GPU. On restore from evictions, it reduces page directory updates
from once per BO mapping to once per GPU.
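
(As a hypothetical example: restoring a process with 1,000 mapped
BOs on one GPU previously meant 1,000 page directory updates; with
this change it means one.)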

Also stop moving PT BOs in the LRU list. This was originally added,
back when we were still pinning memory, to remove PT BOs from the
LRU list. Now it's just a pointless move, which is very expensive
when there are thousands of BOs.

Change-Id: If23e2f30e665511510bad705346543de82cc9cfe
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

 Conflicts:
	drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 231 ++++++++++++-----------
 1 file changed, 118 insertions(+), 113 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 01414bc..505d006 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -335,68 +335,6 @@ static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
 	kfree(ef_list);
 }
 
-static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
-		struct amdgpu_vm *avm, bool is_aql,
-		struct kfd_bo_va_list **p_bo_va_entry)
-{
-	int ret;
-	struct kfd_bo_va_list *bo_va_entry;
-	struct amdgpu_bo *bo = mem->bo;
-	uint64_t va = mem->va;
-	struct list_head *list_bo_va = &mem->bo_va_list;
-	unsigned long bo_size = bo->tbo.mem.size;
-
-	if (is_aql)
-		va += bo_size;
-
-	bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
-	if (!bo_va_entry)
-		return -ENOMEM;
-
-	if (!va) {
-		pr_err("Invalid VA when adding BO to VM\n");
-		return -EINVAL;
-	}
-
-	pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
-			va + bo_size, avm);
-
-	/* Add BO to VM internal data structures*/
-	bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
-	if (bo_va_entry->bo_va == NULL) {
-		ret = -EINVAL;
-		pr_err("Failed to add BO object to VM. ret == %d\n",
-				ret);
-		goto err_vmadd;
-	}
-
-	bo_va_entry->va = va;
-	bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
-							 mem->mapping_flags);
-	bo_va_entry->kgd_dev = (void *)adev;
-	list_add(&bo_va_entry->bo_list, list_bo_va);
-
-	if (p_bo_va_entry)
-		*p_bo_va_entry = bo_va_entry;
-
-	return 0;
-
-err_vmadd:
-	kfree(bo_va_entry);
-	return ret;
-}
-
-static void remove_bo_from_vm(struct amdgpu_device *adev,
-		struct kfd_bo_va_list *entry, unsigned long size)
-{
-	pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
-			entry->va,
-			entry->va + size, entry);
-	amdgpu_vm_bo_rmv(adev, entry->bo_va);
-	list_del(&entry->bo_list);
-	kfree(entry);
-}
-
 static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
 				     bool wait)
 {
@@ -435,6 +373,12 @@ static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
 	return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
 }
 
+/* vm_validate_pt_pd_bos - Validate page table and directory BOs
+ *
+ * Also updates page directory entries so we don't need to do this
+ * again later until the page directory is validated again (e.g. after
+ * an eviction or allocating new page tables).
+ */
 static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
 {
 	struct amdgpu_bo *pd = vm->root.bo;
@@ -460,7 +404,116 @@ static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
 
 	vm->last_eviction_counter = atomic64_read(&adev->num_evictions);
 
+	ret = amdgpu_vm_update_directories(adev, vm);
+	if (ret != 0)
+		return ret;
+
+	return 0;
+}
+
+/* add_bo_to_vm - Add a BO to a VM
+ *
+ * Everything that needs to be done only once when a BO is first added
+ * to a VM. It can later be mapped and unmapped many times without
+ * repeating these steps.
+ *
+ * 1. Allocate and initialize BO VA entry data structure
+ * 2. Add BO to the VM
+ * 3. Determine ASIC-specific PTE flags
+ * 4. Alloc page tables and directories if needed
+ * 4a.  Validate new page tables and directories and update directories
+ */
+static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
+		struct amdgpu_vm *avm, bool is_aql,
+		struct kfd_bo_va_list **p_bo_va_entry)
+{
+	int ret;
+	struct kfd_bo_va_list *bo_va_entry;
+	struct amdkfd_vm *kvm = container_of(avm,
+					     struct amdkfd_vm, base);
+	struct amdgpu_bo *pd = avm->root.bo;
+	struct amdgpu_bo *bo = mem->bo;
+	uint64_t va = mem->va;
+	struct list_head *list_bo_va = &mem->bo_va_list;
+	unsigned long bo_size = bo->tbo.mem.size;
+
+	if (!va) {
+		pr_err("Invalid VA when adding BO to VM\n");
+		return -EINVAL;
+	}
+
+	if (is_aql)
+		va += bo_size;
+
+	bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
+	if (!bo_va_entry)
+		return -ENOMEM;
+
+	pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
+			va + bo_size, avm);
+
+	/* Add BO to VM internal data structures*/
+	bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, avm, bo);
+	if (bo_va_entry->bo_va == NULL) {
+		ret = -EINVAL;
+		pr_err("Failed to add BO object to VM. ret == %d\n",
+				ret);
+		goto err_vmadd;
+	}
+
+	bo_va_entry->va = va;
+	bo_va_entry->pte_flags = amdgpu_vm_get_pte_flags(adev,
+							 mem->mapping_flags);
+	bo_va_entry->kgd_dev = (void *)adev;
+	list_add(&bo_va_entry->bo_list, list_bo_va);
+
+	if (p_bo_va_entry)
+		*p_bo_va_entry = bo_va_entry;
+
+	/* Allocate new page tables if needed and validate
+	 * them. Clearing new page tables and validating them needs to
+	 * wait on move fences. We don't want that to trigger the
+	 * eviction fence, so remove it temporarily.
+	 */
+	amdgpu_amdkfd_remove_eviction_fence(pd,
+					kvm->process_info->eviction_fence,
+					NULL, NULL);
+
+	ret = amdgpu_vm_alloc_pts(adev, avm, va, amdgpu_bo_size(bo));
+	if (ret) {
+		pr_err("Failed to allocate pts, err=%d\n", ret);
+		goto err_alloc_pts;
+	}
+
+	ret = vm_validate_pt_pd_bos(avm);
+	if (ret != 0) {
+		pr_err("validate_pt_pd_bos() failed\n");
+		goto err_alloc_pts;
+	}
+
+	/* Add the eviction fence back */
+	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
+
 	return 0;
+
+err_alloc_pts:
+	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
+	amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
+	list_del(&bo_va_entry->bo_list);
+err_vmadd:
+	kfree(bo_va_entry);
+	return ret;
+}
+
+static void remove_bo_from_vm(struct amdgpu_device *adev,
+		struct kfd_bo_va_list *entry, unsigned long size)
+{
+	pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
+			entry->va,
+			entry->va + size, entry);
+	amdgpu_vm_bo_rmv(adev, entry->bo_va);
+	list_del(&entry->bo_list);
+	kfree(entry);
 }
 
 static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
@@ -940,15 +993,6 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
 	vm = bo_va->vm;
 	bo = bo_va->bo;
 
-	/* Update the page directory */
-	ret = amdgpu_vm_update_directories(adev, vm);
-	if (ret != 0) {
-		pr_err("amdgpu_vm_update_directories failed\n");
-		return ret;
-	}
-
-	amdgpu_sync_fence(adev, sync, vm->last_dir_update);
-
 	/* Update the page tables  */
 	ret = amdgpu_vm_bo_update(adev, bo_va, false);
 	if (ret != 0) {
@@ -958,9 +1002,6 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
 
 	amdgpu_sync_fence(adev, sync, bo_va->last_pt_update);
 
-	/* Remove PTs from LRU list (reservation removed PD only) */
-	amdgpu_vm_move_pt_bos_in_lru(adev, vm);
-
 	/* Sync objects can't handle multiple GPUs (contexts) updating
 	 * sync->last_vm_update. Fortunately we don't need it for
 	 * KFD's purposes, so we can just drop that fence.
@@ -978,52 +1019,16 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
 		bool no_update_pte)
 {
 	int ret;
-	struct amdgpu_bo *bo = entry->bo_va->bo;
-	struct amdkfd_vm *kvm = container_of(entry->bo_va->vm,
-					     struct amdkfd_vm, base);
-	struct amdgpu_bo *pd = entry->bo_va->vm->root.bo;
 
-	/* Remove eviction fence from PD (and thereby from PTs too as they
-	 * share the resv. object. This is necessary because new PTs are
-	 * cleared and validate needs to wait on move fences. The eviction
-	 * fence shouldn't interfere in both these activities
-	 */
-	amdgpu_amdkfd_remove_eviction_fence(pd,
-					kvm->process_info->eviction_fence,
-					NULL, NULL);
-
-	ret = amdgpu_vm_alloc_pts(adev, entry->bo_va->vm, entry->va,
-				  amdgpu_bo_size(bo));
-
-	if (ret) {
-		pr_err("Failed to allocate pts, err=%d\n", ret);
-		return ret;
-	}
-
-	/* Set virtual address for the allocation, allocate PTs,
-	 * if needed, and zero them.
-	 */
-	ret = amdgpu_vm_bo_map(adev, entry->bo_va,
-			entry->va, 0, amdgpu_bo_size(bo),
-			entry->pte_flags);
+	/* Set virtual address for the allocation */
+	ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0,
+			amdgpu_bo_size(entry->bo_va->bo), entry->pte_flags);
 	if (ret != 0) {
 		pr_err("Failed to map VA 0x%llx in vm. ret %d\n",
 				entry->va, ret);
 		return ret;
 	}
 
-	/* PT BOs may be created during amdgpu_vm_bo_map() call,
-	 * so we have to validate the newly created PT BOs.
-	 */
-	ret = vm_validate_pt_pd_bos(entry->bo_va->vm);
-	if (ret != 0) {
-		pr_err("validate_pt_pd_bos() failed\n");
-		return ret;
-	}
-
-	/* Add the eviction fence back */
-	amdgpu_bo_fence(pd, &kvm->process_info->eviction_fence->base, true);
-
 	if (no_update_pte)
 		return 0;
 
-- 
2.7.4