From bfdaf131a7afc8226419aa8f5176648833329105 Mon Sep 17 00:00:00 2001
From: Joseph Greathouse
Date: Wed, 18 Sep 2019 14:49:57 -0500
Subject: [PATCH 3921/4256] Enable over-subscription with >1 GWS queue

The current GWS usage model only allows a single GWS-enabled process
to be active on the GPU at once. This ensures that a barrier-using
kernel gets a known amount of GPU hardware, to prevent deadlock due to
the inability to proceed past the GWS barrier.

The HWS watches how many GWS entries are assigned to each process, and
goes into over-subscription mode when two processes need more than the
64 that are available. The current KFD method for working with this is
to allocate all 64 GWS entries to each GWS-capable process.

When more than one GWS-enabled process is in the runlist, we must make
sure the runlist is in over-subscription mode, so that the HWS gets a
chained RUN_LIST packet and continues scheduling kernels.

Signed-off-by: Joseph Greathouse
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  1 +
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 58 ++++++++++++++++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  1 +
 .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c  |  2 +-
 .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  6 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         | 13 +++++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      |  1 +
 8 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3362b4516089..838a8d46ba47 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -219,6 +219,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 	}

 	q_properties->is_interop = false;
+	q_properties->is_gws = false;
 	q_properties->queue_percent = args->queue_percentage;
 	q_properties->priority = args->queue_priority;
 	q_properties->queue_address = args->ring_base_address;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index eb7e1aaf54a4..3aec5046d26d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -504,8 +504,13 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 		deallocate_vmid(dqm, qpd, q);
 	}
 	qpd->queue_count--;
-	if (q->properties.is_active)
+	if (q->properties.is_active) {
 		dqm->queue_count--;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count--;
+			qpd->mapped_gws_queue = false;
+		}
+	}

 	return retval;
 }
@@ -577,6 +582,20 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 	else if (!q->properties.is_active && prev_active)
 		dqm->queue_count--;

+	if (q->gws && !q->properties.is_gws) {
+		if (q->properties.is_active) {
+			dqm->gws_queue_count++;
+			pdd->qpd.mapped_gws_queue = true;
+		}
+		q->properties.is_gws = true;
+	} else if (!q->gws && q->properties.is_gws) {
+		if (q->properties.is_active) {
+			dqm->gws_queue_count--;
+			pdd->qpd.mapped_gws_queue = false;
+		}
+		q->properties.is_gws = false;
+	}
+
 	if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
 		retval = map_queues_cpsch(dqm);
 	else if (q->properties.is_active &&
@@ -619,6 +638,10 @@ static int suspend_single_queue(struct device_queue_manager *dqm,
 	if (q->properties.is_active) {
 		dqm->queue_count--;
 		q->properties.is_active = false;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count--;
+			pdd->qpd.mapped_gws_queue = false;
+		}
 	}

 	return retval;
@@ -653,6 +676,10 @@ static int resume_single_queue(struct device_queue_manager *dqm,
 	if (QUEUE_IS_ACTIVE(q->properties)) {
 		q->properties.is_active = true;
 		dqm->queue_count++;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count++;
+			qpd->mapped_gws_queue = true;
+		}
 	}

 	return retval;
@@ -693,6 +720,10 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 		 */
 		ret = retval;
 		dqm->queue_count--;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count--;
+			qpd->mapped_gws_queue = false;
+		}
 	}

 out:
@@ -725,6 +756,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,

 		q->properties.is_active = false;
 		dqm->queue_count--;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count--;
+			qpd->mapped_gws_queue = false;
+		}
 	}
 	retval = execute_queues_cpsch(dqm,
 				qpd->is_debug ?
@@ -802,6 +837,10 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		 */
 		ret = retval;
 		dqm->queue_count++;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count++;
+			qpd->mapped_gws_queue = true;
+		}
 	}
 	qpd->evicted = 0;
 out:
@@ -846,6 +885,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,

 		q->properties.is_active = true;
 		dqm->queue_count++;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count++;
+			qpd->mapped_gws_queue = true;
+		}
 	}
 	retval = execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
@@ -952,6 +995,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 	dqm->queue_count = dqm->next_pipe_to_allocate = 0;
 	dqm->sdma_queue_count = 0;
 	dqm->xgmi_sdma_queue_count = 0;
+	dqm->gws_queue_count = 0;
 	dqm->trap_debug_vmid = 0;

 	for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
@@ -1108,6 +1152,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	dqm->queue_count = dqm->processes_count = 0;
 	dqm->sdma_queue_count = 0;
 	dqm->xgmi_sdma_queue_count = 0;
+	dqm->gws_queue_count = 0;
 	dqm->active_runlist = false;
 	dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm));
 	dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm));
@@ -1492,6 +1537,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				USE_DEFAULT_GRACE_PERIOD);
 		if (retval == -ETIME)
 			qpd->reset_wavefronts = true;
+		if (q->properties.is_gws) {
+			dqm->gws_queue_count--;
+			qpd->mapped_gws_queue = false;
+		}
 	}

 	/*
@@ -1704,8 +1753,13 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 			deallocate_sdma_queue(dqm, q);
 		}

-		if (q->properties.is_active)
+		if (q->properties.is_active) {
 			dqm->queue_count--;
+			if (q->properties.is_gws) {
+				dqm->gws_queue_count--;
+				qpd->mapped_gws_queue = false;
+			}
+		}

 		dqm->total_queue_count--;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 54f4fad61359..eed8f950b663 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -182,6 +182,7 @@ struct device_queue_manager {
 	unsigned int		queue_count;
 	unsigned int		sdma_queue_count;
 	unsigned int		xgmi_sdma_queue_count;
+	unsigned int		gws_queue_count;
 	unsigned int		total_queue_count;
 	unsigned int		next_pipe_to_allocate;
 	unsigned int		*allocated_queues;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 9ec62435326e..ac031dc09d66 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -115,6 +115,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 	prop.queue_size = queue_size;
 	prop.is_interop = false;
+	prop.is_gws = false;
 	prop.priority = 1;
 	prop.queue_percent = 100;
 	prop.type = type;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index c3f39ef4de56..f7d9dac26485 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -84,7 +84,7 @@ static int pm_map_process_v9(struct packet_manager *pm,
 	packet->bitfields2.pasid = qpd->pqm->process->pasid;
 	packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
 	packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
-	packet->bitfields14.num_gws = qpd->num_gws;
+	packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0;
 	packet->bitfields14.num_oac = qpd->num_oac;
 	packet->bitfields14.sdma_enable = 1;
 	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 08d3b38117b5..43e8e0258188 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -41,7 +41,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 				unsigned int *rlib_size,
 				bool *over_subscription)
 {
-	unsigned int process_count, queue_count, compute_queue_count;
+	unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
 	unsigned int map_queue_size;
 	unsigned int max_proc_per_quantum = 1;
 	struct kfd_dev *dev = pm->dqm->dev;
@@ -50,6 +50,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 	queue_count = pm->dqm->queue_count;
 	compute_queue_count = queue_count - pm->dqm->sdma_queue_count -
 		pm->dqm->xgmi_sdma_queue_count;
+	gws_queue_count = pm->dqm->gws_queue_count;

 	/* check if there is over subscription
 	 * Note: the arbitration between the number of VMIDs and
@@ -62,7 +63,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 		max_proc_per_quantum = dev->max_proc_per_quantum;

 	if ((process_count > max_proc_per_quantum) ||
-			compute_queue_count > get_queues_num(pm->dqm)) {
+			compute_queue_count > get_queues_num(pm->dqm) ||
+			gws_queue_count > 1) {
 		*over_subscription = true;
 		pr_debug("Over subscribed runlist\n");
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6bf5be992303..9ac50a4eb294 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -466,6 +466,10 @@ enum KFD_QUEUE_PRIORITY {
  * @is_active: Defines if the queue is active or not. @is_active and
  * @is_evicted are protected by the DQM lock.
  *
+ * @is_gws: Defines if the queue has been updated to be GWS-capable or not.
+ * @is_gws should be protected by the DQM lock, since changing it can yield the
+ * possibility of updating DQM state on number of GWS queues.
+ *
  * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid
  * of the queue.
  *
@@ -490,6 +494,7 @@ struct queue_properties {
 	bool is_suspended;
 	bool is_active;
 	bool is_new;
+	bool is_gws;
 	/* Not relevant for user mode queues in cp scheduling */
 	unsigned int vmid;
 	/* Relevant only for sdma queues*/
@@ -628,6 +633,14 @@ struct qcm_process_device {
 	 */
 	bool reset_wavefronts;

+	/* This flag tells us if this process has a GWS-capable
+	 * queue that will be mapped into the runlist. It's
+	 * possible to request a GWS BO, but not have the queue
+	 * currently mapped, and this changes how the MAP_PROCESS
+	 * PM4 packet is configured.
+	 */
+	bool mapped_gws_queue;
+
 	/*
 	 * All the memory management data should be here too
 	 */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c14fdf3bda75..d7e057376d8f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -824,6 +824,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->qpd.dqm = dev->dqm;
 	pdd->qpd.pqm = &p->pqm;
 	pdd->qpd.evicted = 0;
+	pdd->qpd.mapped_gws_queue = false;
 	mutex_init(&pdd->qpd.doorbell_lock);
 	pdd->process = p;
 	pdd->bound = PDD_UNBOUND;
-- 
2.17.1
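
For readers unfamiliar with the runlist sizing logic, the following
standalone sketch illustrates the over-subscription decision this patch
extends in pm_calc_rlib_size(). It is not kernel code: struct rlib_inputs
and the sample values are hypothetical stand-ins for the DQM counters the
real function reads. The point it demonstrates is that more than one
GWS-enabled queue forces over-subscription, so the HWS receives a chained
RUN_LIST packet and time-slices the GWS processes instead of running them
concurrently.

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical stand-in for the DQM state pm_calc_rlib_size() reads. */
	struct rlib_inputs {
		unsigned int process_count;       /* dqm->processes_count */
		unsigned int compute_queue_count; /* queue_count minus SDMA queues */
		unsigned int gws_queue_count;     /* dqm->gws_queue_count (new) */
		unsigned int max_proc_per_quantum;
		unsigned int hqd_slots;           /* get_queues_num(dqm) result */
	};

	/* Mirrors the updated check: since each GWS-capable process is
	 * allocated all 64 GWS entries, two such processes can never hold
	 * the hardware at once, so >1 GWS queue must over-subscribe.
	 */
	static bool over_subscribed(const struct rlib_inputs *in)
	{
		return in->process_count > in->max_proc_per_quantum ||
		       in->compute_queue_count > in->hqd_slots ||
		       in->gws_queue_count > 1;
	}

	int main(void)
	{
		/* Two GWS queues: over-subscribed despite spare HQD slots. */
		struct rlib_inputs in = { 2, 4, 2, 8, 24 };

		printf("over_subscription = %d\n", over_subscribed(&in));
		return 0;
	}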