From a4f81fb2804858867dcc2d0bf338c76a09867a36 Mon Sep 17 00:00:00 2001
From: Amber Lin <Amber.Lin@amd.com>
Date: Fri, 8 Jul 2016 16:18:02 -0400
Subject: [PATCH 1159/4131] drm/amdkfd: Flush TC for GFX v7

GFX v7 doesn't flush the texture cache at DEQUEUE if dirty cache lines remain.
This patch submits an IB containing a RELEASE_MEM packet to flush the cache
before tearing down the VMID. For each process, one page below the CWSR memory
is reserved for IB usage.
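
For reference, the flush path added below boils down to the following
(condensed from the hunks in this patch; context and error handling trimmed):

    /* In deallocate_vmid(), before the PASID/VMID mapping is cleared: */
    if (q->device->device_info->asic_family == CHIP_HAWAII)
        if (flush_texture_cache_nocpsch(q->device, qpd))
            pr_err("kfd: Failed to flush TC\n");

    /* flush_texture_cache_nocpsch() builds a RELEASE_MEM packet in the
     * page reserved below cwsr_base and submits it in the process' VMID:
     */
    len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
    return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
                qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);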

BUG: SWDEV-93847

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  17 +++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  19 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c       |   6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |  37 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 115 +++++++++++++++++++++
 6 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 666853e..af3790f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -332,6 +332,21 @@ static void kfd_cwsr_fini(struct kfd_dev *kfd)
 		__free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size));
 }
 
+static void kfd_ib_mem_init(struct kfd_dev *kdev)
+{
+	/* In certain cases we need to send an IB from the kernel using a GPU
+	 * address space created by a user application.
+	 * For example, on GFX v7 we need to flush the TC associated with a
+	 * VMID before tearing that VMID down. To do so we need an address
+	 * that is valid in that VMID to hold the IB, but the address space
+	 * was created on the user's side, not in the kernel.
+	 * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size"
+	 * but CWSR only uses the pages above cwsr_base, we use the one page
+	 * below cwsr_base for IB submissions.
+	 */
+	kdev->ib_size = PAGE_SIZE;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static int kfd_debugfs_open(struct inode *inode, struct file *file)
@@ -501,6 +516,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	if (kfd_cwsr_init(kfd))
 		goto device_iommu_pasid_error;
 
+	kfd_ib_mem_init(kfd);
+
 	if (kfd_resume(kfd))
 		goto kfd_resume_error;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index aacc4dc..1506597 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -138,12 +138,31 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	return 0;
 }
 
+static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
+				struct qcm_process_device *qpd)
+{
+	uint32_t len;
+
+	if (!qpd->ib_kaddr)
+		return -ENOMEM;
+
+	len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+
+	return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
+				qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+}
+
 static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
 	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
 
+	/* On GFX v7, CP doesn't flush TC at dequeue */
+	if (q->device->device_info->asic_family == CHIP_HAWAII)
+		if (flush_texture_cache_nocpsch(q->device, qpd))
+			pr_err("kfd: Failed to flush TC\n");
+
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 587f847..c52853f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -295,6 +295,7 @@
 
 
 #define DGPU_VM_BASE_DEFAULT 0x100000
+#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
 
 int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
 					uint64_t base, uint64_t limit)
@@ -355,9 +356,10 @@ int kfd_init_apertures(struct kfd_process *process)
 			pdd->scratch_limit =
 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
 
-			if (KFD_IS_DGPU(dev->device_info->asic_family))
+			if (KFD_IS_DGPU(dev->device_info->asic_family)) {
 				pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
-
+				pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT;
+			}
 		}
 
 		dev_dbg(kfd_device, "node id %u\n", id);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index ea0dcd1..47071cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -556,6 +556,43 @@ static int get_map_process_packet_size_scratch(void)
 	return sizeof(struct pm4_map_process_scratch);
 }
 
+/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size
+ *	of the packet in dwords
+ *	@gpu_addr - GPU virtual address where the packet will be placed
+ *	@buffer - buffer to fill with the packet; a CPU kernel pointer
+ *	Return - length of the packet in dwords
+ */
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
+{
+	struct pm4__release_mem *packet;
+
+	WARN_ON(!buffer);
+
+	packet = (struct pm4__release_mem *)buffer;
+	memset(buffer, 0, sizeof(struct pm4__release_mem));
+
+	packet->header.u32all = build_pm4_header(IT_RELEASE_MEM,
+					sizeof(struct pm4__release_mem));
+
+	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+	packet->bitfields2.tcl1_action_ena = 1;
+	packet->bitfields2.tc_action_ena = 1;
+	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+	packet->bitfields2.atc = 0;
+
+	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
+	packet->bitfields3.int_sel =
+		int_sel___release_mem__send_interrupt_after_write_confirm;
+
+	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+	packet->address_hi = upper_32_bits(gpu_addr);
+
+	packet->data_lo = 0;
+
+	return sizeof(struct pm4__release_mem) / sizeof(unsigned int);
+}
+
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
 		uint16_t fw_ver)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a222efc..3814e5a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -271,6 +271,9 @@ struct kfd_dev {
 	uint32_t cwsr_size;
 	uint32_t tma_offset;  /*Offset for TMA from the  start of cwsr_mem*/
 
+	/* IB usage */
+	uint32_t ib_size;
+
 	/* Debugfs */
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry *debugfs_root;
@@ -529,6 +532,10 @@ struct qcm_process_device {
 	uint64_t tba_addr;
 	uint64_t tma_addr;
 	void *cwsr_kaddr;
+
+	/* IB memory */
+	uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */
+	void *ib_kaddr;
 };
 
 /*8 byte handle containing GPU ID in the most significant 4 bytes and
@@ -842,6 +849,7 @@ struct packet_manager_firmware {
 	int (*get_map_process_packet_size)(void);
 };
 
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer);
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
 		uint16_t fw_ver);
 void pm_uninit(struct packet_manager *pm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ff1669b..9b67aaf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -79,6 +79,120 @@ void kfd_process_destroy_wq(void)
 	}
 }
 
+static void kfd_process_free_gpuvm(struct kfd_dev *kdev, struct kgd_mem *mem,
+				void *vm)
+{
+	kdev->kfd2kgd->unmap_memory_to_gpu(kdev->kgd, mem, vm);
+	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem);
+}
+
+/* kfd_process_alloc_gpuvm - Allocate memory in the GPU VM for the KFD process
+ *	We can't hold the process lock while allocating GPU memory, so there
+ *	is a chance that someone else allocates the memory while the lock is
+ *	released. In that case, -EINVAL is returned but kptr remains set so
+ *	the caller knows the memory was allocated (by someone else) and is
+ *	available to use.
+ */
+static int kfd_process_alloc_gpuvm(struct kfd_process *p,
+		struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size,
+		void *vm, void **kptr, struct kfd_process_device *pdd,
+		uint64_t *addr_to_assign)
+{
+	int err;
+	void *mem = NULL;
+
+	/* can't hold the process lock while allocating from KGD */
+	up_write(&p->lock);
+
+	err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, vm,
+				(struct kgd_mem **)&mem, NULL, kptr, pdd,
+				ALLOC_MEM_FLAGS_GTT |
+				ALLOC_MEM_FLAGS_NONPAGED |
+				ALLOC_MEM_FLAGS_EXECUTE_ACCESS |
+				ALLOC_MEM_FLAGS_NO_SUBSTITUTE);
+	if (err)
+		goto err_alloc_mem;
+
+	err = kfd_map_memory_to_gpu(kdev, mem, p, pdd);
+	if (err)
+		goto err_map_mem;
+
+	down_write(&p->lock);
+	/* Check if someone else allocated the memory while we weren't looking
+	 */
+	if (*addr_to_assign) {
+		err = -EINVAL;
+		goto free_gpuvm;
+	} else {
+		/* Create an obj handle so kfd_process_device_remove_obj_handle
+		 * will take care of the bo removal when the process finishes
+		 */
+		if (kfd_process_device_create_obj_handle(
+				pdd, mem, gpu_va, size) < 0) {
+			err = -ENOMEM;
+			*kptr = NULL;
+			goto free_gpuvm;
+		}
+	}
+
+	return err;
+
+free_gpuvm:
+	up_write(&p->lock);
+	kfd_process_free_gpuvm(kdev, (struct kgd_mem *)mem, pdd->vm);
+	down_write(&p->lock);
+	return err;
+
+err_map_mem:
+	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem);
+err_alloc_mem:
+	*kptr = NULL;
+	down_write(&p->lock);
+	return err;
+}
+
+/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage
+ *	The memory reserved is for KFD to submit IBs to amdgpu from the kernel.
+ *	If the memory is reserved successfully, qpd->ib_kaddr will hold the
+ *	CPU/kernel address. Check qpd->ib_kaddr before accessing the
+ *	memory.
+ */
+static int kfd_process_reserve_ib_mem(struct kfd_process *p)
+{
+	int err = 0;
+	struct kfd_process_device *temp, *pdd = NULL;
+	struct kfd_dev *kdev = NULL;
+	struct qcm_process_device *qpd = NULL;
+	void *kaddr;
+
+	down_write(&p->lock);
+	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+				per_device_list) {
+		kdev = pdd->dev;
+		qpd = &pdd->qpd;
+		if (!kdev->ib_size || qpd->ib_kaddr)
+			continue;
+
+		if (qpd->ib_base) { /* is dGPU */
+			err = kfd_process_alloc_gpuvm(p, kdev,
+				qpd->ib_base, kdev->ib_size, pdd->vm,
+				&kaddr, pdd, (uint64_t *)&qpd->ib_kaddr);
+			if (!err)
+				qpd->ib_kaddr = kaddr;
+			else if (qpd->ib_kaddr)
+				err = 0;
+			else
+				err = -ENOMEM;
+		} else {
+			/* FIXME: Support APU */
+			err = -ENOMEM;
+		}
+	}
+
+	up_write(&p->lock);
+	return err;
+}
+
 struct kfd_process *kfd_create_process(struct file *filep)
 {
 	struct kfd_process *process;
@@ -117,6 +231,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
 	up_write(&thread->mm->mmap_sem);
 
 	kfd_process_init_cwsr(process, filep);
+	kfd_process_reserve_ib_mem(process);
 
 	return process;
 }
-- 
2.7.4