1 files changed, 335 insertions, 0 deletions
diff --git a/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1473-drm-amdkfd-Flush-TC-for-GFX-v7.patch b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1473-drm-amdkfd-Flush-TC-for-GFX-v7.patch
new file mode 100644
index 00000000..1eb0f0e2
--- /dev/null
+++ b/meta-v1000/recipes-kernel/linux/linux-yocto-4.14.71/1473-drm-amdkfd-Flush-TC-for-GFX-v7.patch
@@ -0,0 +1,335 @@
+From f038d18b7f4a5d69740d8c9bf2c8e67721753c01 Mon Sep 17 00:00:00 2001
+From: Amber Lin <Amber.Lin@amd.com>
+Date: Fri, 8 Jul 2016 16:18:02 -0400
+Subject: [PATCH 1473/4131] drm/amdkfd: Flush TC for GFX v7
+
+GFX v7 doesn't flush texture cache at DEQUEUE if any dirty cache remains.
+This patch submits an IB packet of RELEASE_MEM command to flush the cache
+before tearing down VMID. For each process, one page below CWSR memory is
+reserved for IB usage.
+
+BUG: SWDEV-93847
+
+Signed-off-by: Amber Lin <Amber.Lin@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  17 +++
+ .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  19 ++++
+ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c       |   6 +-
+ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |  37 +++++++
+ drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |   8 ++
+ drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 115 +++++++++++++++++++++
+ 6 files changed, 200 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+index 666853e..af3790f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+@@ -332,6 +332,21 @@ static void kfd_cwsr_fini(struct kfd_dev *kfd)
+ 		__free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size));
+ }
+ 
++static void kfd_ib_mem_init(struct kfd_dev *kdev)
++{
++	/* In certain cases we need to send IB from kernel using the GPU address
++	 * space created by user applications.
++	 * For example, on GFX v7, we need to flush TC associated with the VMID
++	 * before tearing down the VMID. To do so, the IB must be placed at an
++	 * address valid in that VMID, even though the address space was set
++	 * up from the user's side, not by the kernel.
++	 * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size"
++	 * but CWSR only uses pages above cwsr_base, we'll use one page of
++	 * memory below cwsr_base for IB submissions.
++	 */
++	kdev->ib_size = PAGE_SIZE;
++}
++
+ #if defined(CONFIG_DEBUG_FS)
+ 
+ static int kfd_debugfs_open(struct inode *inode, struct file *file)
+@@ -501,6 +516,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
+ 	if (kfd_cwsr_init(kfd))
+ 		goto device_iommu_pasid_error;
+ 
++	kfd_ib_mem_init(kfd);
++
+ 	if (kfd_resume(kfd))
+ 		goto kfd_resume_error;
+ 
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+index aacc4dc..1506597 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+@@ -138,12 +138,31 @@ static int allocate_vmid(struct device_queue_manager *dqm,
+ 	return 0;
+ }
+ 
++static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
++				struct qcm_process_device *qpd)
++{
++	uint32_t len; /* packet length returned by pm_create_release_mem */
++
++	if (!qpd->ib_kaddr) /* no kernel address for the IB buffer */
++		return -ENOMEM;
++	/* Build a RELEASE_MEM packet in the IB buffer ... */
++	len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
++	/* ... and submit it as an IB on MEC1 using this process's VMID */
++	return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
++				qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
++}
++
+ static void deallocate_vmid(struct device_queue_manager *dqm,
+ 				struct qcm_process_device *qpd,
+ 				struct queue *q)
+ {
+ 	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
+ 
++	/* On GFX v7, CP doesn't flush TC at dequeue */
++	if (q->device->device_info->asic_family == CHIP_HAWAII)
++		if (flush_texture_cache_nocpsch(q->device, qpd))
++			pr_err("kfd: Failed to flush TC\n");
++
+ 	/* Release the vmid mapping */
+ 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+ 
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+index 587f847..c52853f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+@@ -295,6 +295,7 @@
+ 
+ 
+ #define DGPU_VM_BASE_DEFAULT 0x100000
++#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
+ 
+ int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
+ 					uint64_t base, uint64_t limit)
+@@ -355,9 +356,10 @@ int kfd_init_apertures(struct kfd_process *process)
+ 			pdd->scratch_limit =
+ 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+ 
+-			if (KFD_IS_DGPU(dev->device_info->asic_family))
++			if (KFD_IS_DGPU(dev->device_info->asic_family)) {
+ 				pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
+-
++				pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT;
++			}
+ 		}
+ 
+ 		dev_dbg(kfd_device, "node id %u\n", id);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+index f777645..50d015f 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+@@ -554,6 +554,43 @@ static int get_map_process_packet_size_scratch(void)
+ 	return sizeof(struct pm4_map_process_scratch);
+ }
+ 
++/* pm_create_release_mem - Build a RELEASE_MEM packet in @buffer
++ *	@gpu_addr - GPU address of the packet. It's a virtual address.
++ *	@buffer - buffer to fill up with the packet. It's a CPU kernel pointer
++ *	Return - packet length in 32-bit words, or 0 if @buffer is NULL
++ */
++uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
++{
++	struct pm4__release_mem *packet;
++
++	if (WARN_ON(!buffer)) /* warn and bail out instead of writing to NULL */
++		return 0;
++
++	packet = (struct pm4__release_mem *)buffer;
++	memset(buffer, 0, sizeof(struct pm4__release_mem));
++
++	packet->header.u32all = build_pm4_header(IT_RELEASE_MEM,
++					sizeof(struct pm4__release_mem));
++
++	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
++	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
++	packet->bitfields2.tcl1_action_ena = 1;
++	packet->bitfields2.tc_action_ena = 1;
++	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
++	packet->bitfields2.atc = 0;
++
++	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
++	packet->bitfields3.int_sel =
++		int_sel___release_mem__send_interrupt_after_write_confirm;
++
++	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
++	packet->address_hi = upper_32_bits(gpu_addr);
++
++	packet->data_lo = 0;
++
++	return sizeof(struct pm4__release_mem) / sizeof(unsigned int);
++}
++
+ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+ 		uint16_t fw_ver)
+ {
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+index d19fd6b..e702ed5 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+@@ -271,6 +271,9 @@ struct kfd_dev {
+ 	uint32_t cwsr_size;
+ 	uint32_t tma_offset;  /*Offset for TMA from the  start of cwsr_mem*/
+ 
++	/* IB usage */
++	uint32_t ib_size;
++
+ 	/* Debugfs */
+ #if defined(CONFIG_DEBUG_FS)
+ 	struct dentry *debugfs_root;
+@@ -529,6 +532,10 @@ struct qcm_process_device {
+ 	uint64_t tba_addr;
+ 	uint64_t tma_addr;
+ 	void *cwsr_kaddr;
++
++	/* IB memory */
++	uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */
++	void *ib_kaddr;
+ };
+ 
+ /*8 byte handle containing GPU ID in the most significant 4 bytes and
+@@ -842,6 +849,7 @@ struct packet_manager_firmware {
+ 	int (*get_map_process_packet_size)(void);
+ };
+ 
++uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer);
+ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
+ 		uint16_t fw_ver);
+ void pm_uninit(struct packet_manager *pm);
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+index ff1669b..9b67aaf 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+@@ -79,6 +79,120 @@ void kfd_process_destroy_wq(void)
+ 	}
+ }
+ 
++static void kfd_process_free_gpuvm(struct kfd_dev *kdev, struct kgd_mem *mem,
++				void *vm)
++{
++	kdev->kfd2kgd->unmap_memory_to_gpu(kdev->kgd, mem, vm);
++	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem); /* unmapped above */
++}
++
++/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
++ *	Called with p->lock held for writing. The lock is dropped while
++ *	allocating and mapping, so someone else may set *addr_to_assign in
++ *	the meantime. In that case the new allocation is freed and -EINVAL is
++ *	returned; the caller should use the address in *addr_to_assign
++ *	(*kptr is not reset on that path). Other failures set *kptr to NULL.
++ */
++static int kfd_process_alloc_gpuvm(struct kfd_process *p,
++		struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size,
++		void *vm, void **kptr, struct kfd_process_device *pdd,
++		uint64_t *addr_to_assign)
++{
++	int err;
++	void *mem = NULL;
++
++	/* can't hold the process lock while allocating from KGD */
++	up_write(&p->lock);
++
++	err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, vm,
++				(struct kgd_mem **)&mem, NULL, kptr, pdd,
++				ALLOC_MEM_FLAGS_GTT |
++				ALLOC_MEM_FLAGS_NONPAGED |
++				ALLOC_MEM_FLAGS_EXECUTE_ACCESS |
++				ALLOC_MEM_FLAGS_NO_SUBSTITUTE);
++	if (err)
++		goto err_alloc_mem;
++
++	err = kfd_map_memory_to_gpu(kdev, mem, p, pdd);
++	if (err)
++		goto err_map_mem;
++
++	down_write(&p->lock);
++	/* Check if someone else allocated the memory while we weren't looking
++	 */
++	if (*addr_to_assign) {
++		err = -EINVAL;
++		goto free_gpuvm;
++	} else {
++		/* Create an obj handle so kfd_process_device_remove_obj_handle
++		 * will take care of the bo removal when the process finishes
++		 */
++		if (kfd_process_device_create_obj_handle(
++				pdd, mem, gpu_va, size) < 0) {
++			err = -ENOMEM;
++			*kptr = NULL;
++			goto free_gpuvm;
++		}
++	}
++
++	return err;
++
++free_gpuvm:
++	up_write(&p->lock);
++	kfd_process_free_gpuvm(kdev, (struct kgd_mem *)mem, pdd->vm);
++	down_write(&p->lock);
++	return err;
++
++err_map_mem:
++	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem);
++err_alloc_mem:
++	*kptr = NULL;
++	down_write(&p->lock);
++	return err;
++}
++
++/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage
++ *	The memory reserved is for KFD to submit IB to AMDGPU from kernel.
++ *	On success each device's qpd->ib_kaddr holds the CPU/kernel address;
++ *	check qpd->ib_kaddr before accessing the memory.
++ *	NOTE(review): err is overwritten per device; only the last one counts.
++ */
++static int kfd_process_reserve_ib_mem(struct kfd_process *p)
++{
++	int err = 0;
++	struct kfd_process_device *temp, *pdd = NULL;
++	struct kfd_dev *kdev = NULL;
++	struct qcm_process_device *qpd = NULL;
++	void *kaddr;
++
++	down_write(&p->lock);
++	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
++				per_device_list) {
++		kdev = pdd->dev;
++		qpd = &pdd->qpd;
++		if (!kdev->ib_size || qpd->ib_kaddr)
++			continue;
++
++		if (qpd->ib_base) { /* is dGPU */
++			err = kfd_process_alloc_gpuvm(p, kdev,
++				qpd->ib_base, kdev->ib_size, pdd->vm,
++				&kaddr, pdd, (uint64_t *)&qpd->ib_kaddr);
++			if (!err)
++				qpd->ib_kaddr = kaddr;
++			else if (qpd->ib_kaddr)
++				err = 0; /* lost the race; memory exists */
++			else
++				err = -ENOMEM;
++		} else {
++			/* FIXME: Support APU */
++			err = -ENOMEM;
++		}
++	}
++
++	up_write(&p->lock);
++	return err;
++}
++
+ struct kfd_process *kfd_create_process(struct file *filep)
+ {
+ 	struct kfd_process *process;
+@@ -117,6 +231,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
+ 	up_write(&thread->mm->mmap_sem);
+ 
+ 	kfd_process_init_cwsr(process, filep);
++	kfd_process_reserve_ib_mem(process);
+ 
+ 	return process;
+ }
+-- 
+2.7.4
+