From f038d18b7f4a5d69740d8c9bf2c8e67721753c01 Mon Sep 17 00:00:00 2001
From: Amber Lin <Amber.Lin@amd.com>
Date: Fri, 8 Jul 2016 16:18:02 -0400
Subject: [PATCH 1473/4131] drm/amdkfd: Flush TC for GFX v7

GFX v7 doesn't flush the texture cache (TC) at DEQUEUE, even if dirty cache
lines remain. This patch submits an IB packet with a RELEASE_MEM command to
flush the cache before tearing down the VMID. For each process, one page
below the CWSR memory is reserved for IB usage.

BUG: SWDEV-93847

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c            |  17 +++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  |  19 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c       |   6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c    |  37 +++++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h              |   8 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c           | 115 +++++++++++++++++++++
 6 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 666853e..af3790f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -332,6 +332,21 @@ static void kfd_cwsr_fini(struct kfd_dev *kfd)
 		__free_pages(kfd->cwsr_pages, get_order(kfd->cwsr_size));
 }
 
+static void kfd_ib_mem_init(struct kfd_dev *kdev)
+{
+	/* In certain cases we need to send an IB from the kernel using the
+	 * GPU address space created by a user application.
+	 * For example, on GFX v7, we need to flush the TC associated with the
+	 * VMID before tearing down the VMID. To do so, we need an address that
+	 * is valid in that VMID to place the IB, but this address space was
+	 * created on the user's side, not by the kernel.
+	 * Since kfd_set_process_dgpu_aperture reserves "cwsr_base + cwsr_size"
+	 * but CWSR only uses pages above cwsr_base, we'll use one page of
+	 * memory under cwsr_base for IB submissions.
+	 */
+	kdev->ib_size = PAGE_SIZE;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static int kfd_debugfs_open(struct inode *inode, struct file *file)
@@ -501,6 +516,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	if (kfd_cwsr_init(kfd))
 		goto device_iommu_pasid_error;
 
+	kfd_ib_mem_init(kfd);
+
 	if (kfd_resume(kfd))
 		goto kfd_resume_error;
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index aacc4dc..1506597 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -138,12 +138,31 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	return 0;
 }
 
+static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
+				struct qcm_process_device *qpd)
+{
+	uint32_t len;
+
+	if (!qpd->ib_kaddr)
+		return -ENOMEM;	/* no IB buffer to build the packet in */
+
+	len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+	/* submit the RELEASE_MEM packet as an IB on MEC1 under qpd->vmid */
+	return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
+				qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+}
+
 static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
 	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
 
+	/* On GFX v7, CP doesn't flush TC at dequeue */
+	if (q->device->device_info->asic_family == CHIP_HAWAII)
+		if (flush_texture_cache_nocpsch(q->device, qpd))
+			pr_err("kfd: Failed to flush TC\n");
+
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 587f847..c52853f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -295,6 +295,7 @@
 
 
 #define DGPU_VM_BASE_DEFAULT 0x100000
+#define DGPU_IB_BASE_DEFAULT (DGPU_VM_BASE_DEFAULT - PAGE_SIZE)
 
 int kfd_set_process_dgpu_aperture(struct kfd_process_device *pdd,
 					uint64_t base, uint64_t limit)
@@ -355,9 +356,10 @@ int kfd_init_apertures(struct kfd_process *process)
 			pdd->scratch_limit =
 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
 
-			if (KFD_IS_DGPU(dev->device_info->asic_family))
+			if (KFD_IS_DGPU(dev->device_info->asic_family)) {
 				pdd->qpd.cwsr_base = DGPU_VM_BASE_DEFAULT;
-
+				pdd->qpd.ib_base = DGPU_IB_BASE_DEFAULT;
+			}
 		}
 
 		dev_dbg(kfd_device, "node id %u\n", id);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index f777645..50d015f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -554,6 +554,43 @@ static int get_map_process_packet_size_scratch(void)
 	return sizeof(struct pm4_map_process_scratch);
 }
 
+/* pm_create_release_mem - Create a RELEASE_MEM packet and return its length
+ *	@gpu_addr - GPU address of the packet. It's a virtual address.
+ *	@buffer - buffer to fill up with the packet. It's a CPU kernel pointer
+ *		and must hold at least sizeof(struct pm4__release_mem) bytes.
+ *	Return - length of the packet in dwords, or 0 if @buffer is NULL
+ */
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
+{
+	struct pm4__release_mem *packet;
+
+	if (WARN_ON(!buffer))
+		return 0;	/* nothing built; never write through NULL */
+
+	packet = (struct pm4__release_mem *)buffer;
+	memset(buffer, 0, sizeof(struct pm4__release_mem));
+
+	packet->header.u32all = build_pm4_header(IT_RELEASE_MEM,
+					sizeof(struct pm4__release_mem));
+
+	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+	packet->bitfields2.tcl1_action_ena = 1;
+	packet->bitfields2.tc_action_ena = 1;
+	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+	packet->bitfields2.atc = 0;
+
+	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
+	packet->bitfields3.int_sel =
+		int_sel___release_mem__send_interrupt_after_write_confirm;
+
+	packet->bitfields4.address_lo_32b = lower_32_bits(gpu_addr) >> 2;
+	packet->address_hi = upper_32_bits(gpu_addr);
+	packet->data_lo = 0;
+
+	return sizeof(struct pm4__release_mem) / sizeof(unsigned int);
+}
+
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
 		uint16_t fw_ver)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d19fd6b..e702ed5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -271,6 +271,9 @@ struct kfd_dev {
 	uint32_t cwsr_size;
 	uint32_t tma_offset;  /*Offset for TMA from the  start of cwsr_mem*/
 
+	/* IB usage */
+	uint32_t ib_size;
+
 	/* Debugfs */
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry *debugfs_root;
@@ -529,6 +532,10 @@ struct qcm_process_device {
 	uint64_t tba_addr;
 	uint64_t tma_addr;
 	void *cwsr_kaddr;
+
+	/* IB memory */
+	uint64_t ib_base; /* ib_base+ib_size must be below cwsr_base */
+	void *ib_kaddr;
 };
 
 /*8 byte handle containing GPU ID in the most significant 4 bytes and
@@ -842,6 +849,7 @@ struct packet_manager_firmware {
 	int (*get_map_process_packet_size)(void);
 };
 
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer);
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm,
 		uint16_t fw_ver);
 void pm_uninit(struct packet_manager *pm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ff1669b..9b67aaf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -79,6 +79,120 @@ void kfd_process_destroy_wq(void)
 	}
 }
 
+static void kfd_process_free_gpuvm(struct kfd_dev *kdev, struct kgd_mem *mem,
+				void *vm)
+{
+	kdev->kfd2kgd->unmap_memory_to_gpu(kdev->kgd, mem, vm); /* unmap */
+	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem); /* then free */
+}
+
+/* kfd_process_alloc_gpuvm - Allocate GPU VM for the KFD process
+ *	The process lock (held by the caller) is dropped while allocating and
+ *	mapping, and is held again on return. Someone else may assign
+ *	*addr_to_assign in that window; the new buffer is then freed and
+ *	-EINVAL is returned, so the caller can use the already assigned memory.
+ *	On any failure *kptr is set to NULL.
+ */
+static int kfd_process_alloc_gpuvm(struct kfd_process *p,
+		struct kfd_dev *kdev, uint64_t gpu_va, uint32_t size,
+		void *vm, void **kptr, struct kfd_process_device *pdd,
+		uint64_t *addr_to_assign)
+{
+	int err;
+	void *mem = NULL;
+
+	/* can't hold the process lock while allocating from KGD */
+	up_write(&p->lock);
+
+	err = kdev->kfd2kgd->alloc_memory_of_gpu(kdev->kgd, gpu_va, size, vm,
+				(struct kgd_mem **)&mem, NULL, kptr, pdd,
+				ALLOC_MEM_FLAGS_GTT |
+				ALLOC_MEM_FLAGS_NONPAGED |
+				ALLOC_MEM_FLAGS_EXECUTE_ACCESS |
+				ALLOC_MEM_FLAGS_NO_SUBSTITUTE);
+	if (err)
+		goto err_alloc_mem;
+
+	err = kfd_map_memory_to_gpu(kdev, mem, p, pdd);
+	if (err)
+		goto err_map_mem;
+
+	down_write(&p->lock);
+	/* Check if someone else allocated the memory while we weren't looking */
+	if (*addr_to_assign) {
+		err = -EINVAL;
+		goto free_gpuvm;
+	} else {
+		/* Create an obj handle so kfd_process_device_remove_obj_handle
+		 * will take care of the bo removal when the process finishes
+		 */
+		if (kfd_process_device_create_obj_handle(
+				pdd, mem, gpu_va, size) < 0) {
+			err = -ENOMEM;
+			goto free_gpuvm;
+		}
+	}
+
+	return err;
+
+free_gpuvm:
+	/* the buffer is about to be freed; don't leave *kptr pointing at it */
+	*kptr = NULL;
+	up_write(&p->lock);
+	kfd_process_free_gpuvm(kdev, (struct kgd_mem *)mem, vm);
+	down_write(&p->lock);
+	return err;
+
+err_map_mem:
+	kdev->kfd2kgd->free_memory_of_gpu(kdev->kgd, mem);
+err_alloc_mem:
+	*kptr = NULL;
+	down_write(&p->lock);
+	return err;
+}
+
+/* kfd_process_reserve_ib_mem - Reserve memory inside the process for IB usage
+ *	The memory reserved is for KFD to submit IB to AMDGPU from kernel.
+ *	If the memory is reserved successfully, qpd->ib_kaddr of the device
+ *	will have the CPU/kernel address. Check qpd->ib_kaddr before accessing
+ *	the memory.
+ */
+static int kfd_process_reserve_ib_mem(struct kfd_process *p)
+{
+	int err = 0;
+	struct kfd_process_device *temp, *pdd = NULL;
+	struct kfd_dev *kdev = NULL;
+	struct qcm_process_device *qpd = NULL;
+	void *kaddr;
+
+	down_write(&p->lock);
+	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
+				per_device_list) {
+		kdev = pdd->dev;
+		qpd = &pdd->qpd;
+		if (!kdev->ib_size || qpd->ib_kaddr)
+			continue; /* nothing to reserve, or already done */
+
+		if (qpd->ib_base) { /* is dGPU */
+			err = kfd_process_alloc_gpuvm(p, kdev,
+				qpd->ib_base, kdev->ib_size, pdd->vm,
+				&kaddr, pdd, (uint64_t *)&qpd->ib_kaddr);
+			if (!err)
+				qpd->ib_kaddr = kaddr;
+			else if (qpd->ib_kaddr)
+				err = 0; /* allocated by someone else */
+			else
+				err = -ENOMEM;
+		} else {
+			/* FIXME: Support APU */
+			err = -ENOMEM;
+		}
+	}
+
+	up_write(&p->lock);
+	return err;
+}
+
 struct kfd_process *kfd_create_process(struct file *filep)
 {
 	struct kfd_process *process;
@@ -117,6 +231,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
 	up_write(&thread->mm->mmap_sem);
 
 	kfd_process_init_cwsr(process, filep);
+	kfd_process_reserve_ib_mem(process);
 
 	return process;
 }
-- 
2.7.4