diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch | 314 |
1 files changed, 314 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch new file mode 100644 index 00000000..2a841667 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch @@ -0,0 +1,314 @@ +From bfdaf131a7afc8226419aa8f5176648833329105 Mon Sep 17 00:00:00 2001 +From: Joseph Greathouse <Joseph.Greathouse@amd.com> +Date: Wed, 18 Sep 2019 14:49:57 -0500 +Subject: [PATCH 3921/4256] Enable over-subscription with >1 GWS queue + +The current GWS usage model will only allow a single GWS-enabled +process to be active on the GPU at once. This ensures that a +barrier-using kernel gets a known amount of GPU hardware, to +prevent deadlock due to inability to go beyond the GWS barrier. + +The HWS watches how many GWS entries are assigned to each process, +and goes into over-subscription mode when two processes need more +than the 64 that are available. The current KFD method for working +with this is to allocate all 64 GWS entries to each GWS-capable +process. + +When more than one GWS-enabled process is in the runlist, we must +make sure the runlist is in over-subscription mode, so that the +HWS gets a chained RUN_LIST packet and continues scheduling +kernels. 
+ +Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com> +--- + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 + + .../drm/amd/amdkfd/kfd_device_queue_manager.c | 58 ++++++++++++++++++- + .../drm/amd/amdkfd/kfd_device_queue_manager.h | 1 + + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 1 + + .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 2 +- + .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 6 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 13 +++++ + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 + + 8 files changed, 78 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 3362b4516089..838a8d46ba47 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -219,6 +219,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + } + + q_properties->is_interop = false; ++ q_properties->is_gws = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index eb7e1aaf54a4..3aec5046d26d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -504,8 +504,13 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + deallocate_vmid(dqm, qpd, q); + } + qpd->queue_count--; +- if (q->properties.is_active) ++ if (q->properties.is_active) { + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } ++ } + + return retval; + } +@@ -577,6 +582,20 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + ++ if (q->gws && 
!q->properties.is_gws) { ++ if (q->properties.is_active) { ++ dqm->gws_queue_count++; ++ pdd->qpd.mapped_gws_queue = true; ++ } ++ q->properties.is_gws = true; ++ } else if (!q->gws && q->properties.is_gws) { ++ if (q->properties.is_active) { ++ dqm->gws_queue_count--; ++ pdd->qpd.mapped_gws_queue = false; ++ } ++ q->properties.is_gws = false; ++ } ++ + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && +@@ -619,6 +638,10 @@ static int suspend_single_queue(struct device_queue_manager *dqm, + if (q->properties.is_active) { + dqm->queue_count--; + q->properties.is_active = false; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ pdd->qpd.mapped_gws_queue = false; ++ } + } + + return retval; +@@ -653,6 +676,10 @@ static int resume_single_queue(struct device_queue_manager *dqm, + if (QUEUE_IS_ACTIVE(q->properties)) { + q->properties.is_active = true; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + + return retval; +@@ -693,6 +720,10 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, + */ + ret = retval; + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + + out: +@@ -725,6 +756,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, + + q->properties.is_active = false; + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + retval = execute_queues_cpsch(dqm, + qpd->is_debug ? 
+@@ -802,6 +837,10 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, + */ + ret = retval; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + qpd->evicted = 0; + out: +@@ -846,6 +885,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, + + q->properties.is_active = true; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, +@@ -952,6 +995,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; ++ dqm->gws_queue_count = 0; + dqm->trap_debug_vmid = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { +@@ -1108,6 +1152,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; ++ dqm->gws_queue_count = 0; + dqm->active_runlist = false; + dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm)); + dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm)); +@@ -1492,6 +1537,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + USE_DEFAULT_GRACE_PERIOD); + if (retval == -ETIME) + qpd->reset_wavefronts = true; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + + /* +@@ -1704,8 +1753,13 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, + deallocate_sdma_queue(dqm, q); + } + +- if (q->properties.is_active) ++ if (q->properties.is_active) { + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } ++ } + + dqm->total_queue_count--; + } +diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index 54f4fad61359..eed8f950b663 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -182,6 +182,7 @@ struct device_queue_manager { + unsigned int queue_count; + unsigned int sdma_queue_count; + unsigned int xgmi_sdma_queue_count; ++ unsigned int gws_queue_count; + unsigned int total_queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 9ec62435326e..ac031dc09d66 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -115,6 +115,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + + prop.queue_size = queue_size; + prop.is_interop = false; ++ prop.is_gws = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +index c3f39ef4de56..f7d9dac26485 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +@@ -84,7 +84,7 @@ static int pm_map_process_v9(struct packet_manager *pm, + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size & 0x3F; + packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; +- packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 08d3b38117b5..43e8e0258188 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -41,7 +41,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) + { +- unsigned int process_count, queue_count, compute_queue_count; ++ unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; + unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; +@@ -50,6 +50,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count - + pm->dqm->xgmi_sdma_queue_count; ++ gws_queue_count = pm->dqm->gws_queue_count; + + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and +@@ -62,7 +63,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || +- compute_queue_count > get_queues_num(pm->dqm)) { ++ compute_queue_count > get_queues_num(pm->dqm) || ++ gws_queue_count > 1) { + *over_subscription = true; + pr_debug("Over subscribed runlist\n"); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 6bf5be992303..9ac50a4eb294 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -466,6 +466,10 @@ enum KFD_QUEUE_PRIORITY { + * @is_active: Defines if the queue is active or not. @is_active and + * @is_evicted are protected by the DQM lock. + * ++ * @is_gws: Defines if the queue has been updated to be GWS-capable or not. 
++ * @is_gws should be protected by the DQM lock, since changing it can yield the ++ * possibility of updating DQM state on number of GWS queues. ++ * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * +@@ -490,6 +494,7 @@ struct queue_properties { + bool is_suspended; + bool is_active; + bool is_new; ++ bool is_gws; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; + /* Relevant only for sdma queues*/ +@@ -628,6 +633,14 @@ struct qcm_process_device { + */ + bool reset_wavefronts; + ++ /* This flag tells us if this process has a GWS-capable ++ * queue that will be mapped into the runlist. It's ++ * possible to request a GWS BO, but not have the queue ++ * currently mapped, and this changes how the MAP_PROCESS ++ * PM4 packet is configured. ++ */ ++ bool mapped_gws_queue; ++ + /* + * All the memory management data should be here too + */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index c14fdf3bda75..d7e057376d8f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -824,6 +824,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->qpd.evicted = 0; ++ pdd->qpd.mapped_gws_queue = false; + mutex_init(&pdd->qpd.doorbell_lock); + pdd->process = p; + pdd->bound = PDD_UNBOUND; +-- +2.17.1 + |