diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch | 314 |
1 files changed, 314 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch new file mode 100644 index 00000000..2a841667 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3921-Enable-over-subscription-with-1-GWS-queue.patch @@ -0,0 +1,314 @@ +From bfdaf131a7afc8226419aa8f5176648833329105 Mon Sep 17 00:00:00 2001 +From: Joseph Greathouse <Joseph.Greathouse@amd.com> +Date: Wed, 18 Sep 2019 14:49:57 -0500 +Subject: [PATCH 3921/4256] Enable over-subscription with >1 GWS queue + +The current GWS usage model will only allow a single GWS-enabled +process to be active on the GPU at once. This ensures that a +barrier-using kernel gets a known amount of GPU hardware, to +prevent deadlock due to inability to go beyond the GWS barrier. + +The HWS watches how many GWS entries are assigned to each process, +and goes into over-subscription mode when two processes need more +than the 64 that are available. The current KFD method for working +with this is to allocate all 64 GWS entries to each GWS-capable +process. + +When more than one GWS-enabled process is in the runlist, we must +make sure the runlist is in over-subscription mode, so that the +HWS gets a chained RUN_LIST packet and continues scheduling +kernels. 
+ +Signed-off-by: Joseph Greathouse <Joseph.Greathouse@amd.com> +--- + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 + + .../drm/amd/amdkfd/kfd_device_queue_manager.c | 58 ++++++++++++++++++- + .../drm/amd/amdkfd/kfd_device_queue_manager.h | 1 + + drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 1 + + .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c | 2 +- + .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 6 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 13 +++++ + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 + + 8 files changed, 78 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +index 3362b4516089..838a8d46ba47 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +@@ -219,6 +219,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, + } + + q_properties->is_interop = false; ++ q_properties->is_gws = false; + q_properties->queue_percent = args->queue_percentage; + q_properties->priority = args->queue_priority; + q_properties->queue_address = args->ring_base_address; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +index eb7e1aaf54a4..3aec5046d26d 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +@@ -504,8 +504,13 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, + deallocate_vmid(dqm, qpd, q); + } + qpd->queue_count--; +- if (q->properties.is_active) ++ if (q->properties.is_active) { + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } ++ } + + return retval; + } +@@ -577,6 +582,20 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) + else if (!q->properties.is_active && prev_active) + dqm->queue_count--; + ++ if (q->gws && 
!q->properties.is_gws) { ++ if (q->properties.is_active) { ++ dqm->gws_queue_count++; ++ pdd->qpd.mapped_gws_queue = true; ++ } ++ q->properties.is_gws = true; ++ } else if (!q->gws && q->properties.is_gws) { ++ if (q->properties.is_active) { ++ dqm->gws_queue_count--; ++ pdd->qpd.mapped_gws_queue = false; ++ } ++ q->properties.is_gws = false; ++ } ++ + if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) + retval = map_queues_cpsch(dqm); + else if (q->properties.is_active && +@@ -619,6 +638,10 @@ static int suspend_single_queue(struct device_queue_manager *dqm, + if (q->properties.is_active) { + dqm->queue_count--; + q->properties.is_active = false; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ pdd->qpd.mapped_gws_queue = false; ++ } + } + + return retval; +@@ -653,6 +676,10 @@ static int resume_single_queue(struct device_queue_manager *dqm, + if (QUEUE_IS_ACTIVE(q->properties)) { + q->properties.is_active = true; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + + return retval; +@@ -693,6 +720,10 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, + */ + ret = retval; + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + + out: +@@ -725,6 +756,10 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, + + q->properties.is_active = false; + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + retval = execute_queues_cpsch(dqm, + qpd->is_debug ? 
+@@ -802,6 +837,10 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, + */ + ret = retval; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + qpd->evicted = 0; + out: +@@ -846,6 +885,10 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, + + q->properties.is_active = true; + dqm->queue_count++; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count++; ++ qpd->mapped_gws_queue = true; ++ } + } + retval = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, +@@ -952,6 +995,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->next_pipe_to_allocate = 0; + dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; ++ dqm->gws_queue_count = 0; + dqm->trap_debug_vmid = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { +@@ -1108,6 +1152,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm) + dqm->queue_count = dqm->processes_count = 0; + dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; ++ dqm->gws_queue_count = 0; + dqm->active_runlist = false; + dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm)); + dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm)); +@@ -1492,6 +1537,10 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, + USE_DEFAULT_GRACE_PERIOD); + if (retval == -ETIME) + qpd->reset_wavefronts = true; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } + } + + /* +@@ -1704,8 +1753,13 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, + deallocate_sdma_queue(dqm, q); + } + +- if (q->properties.is_active) ++ if (q->properties.is_active) { + dqm->queue_count--; ++ if (q->properties.is_gws) { ++ dqm->gws_queue_count--; ++ qpd->mapped_gws_queue = false; ++ } ++ } + + dqm->total_queue_count--; + } +diff --git 
a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +index 54f4fad61359..eed8f950b663 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +@@ -182,6 +182,7 @@ struct device_queue_manager { + unsigned int queue_count; + unsigned int sdma_queue_count; + unsigned int xgmi_sdma_queue_count; ++ unsigned int gws_queue_count; + unsigned int total_queue_count; + unsigned int next_pipe_to_allocate; + unsigned int *allocated_queues; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +index 9ec62435326e..ac031dc09d66 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +@@ -115,6 +115,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, + + prop.queue_size = queue_size; + prop.is_interop = false; ++ prop.is_gws = false; + prop.priority = 1; + prop.queue_percent = 100; + prop.type = type; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +index c3f39ef4de56..f7d9dac26485 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +@@ -84,7 +84,7 @@ static int pm_map_process_v9(struct packet_manager *pm, + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size & 0x3F; + packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; +- packet->bitfields14.num_gws = qpd->num_gws; ++ packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? qpd->num_gws : 0; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + packet->bitfields14.num_queues = (qpd->is_debug) ? 
0 : qpd->queue_count; +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +index 08d3b38117b5..43e8e0258188 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +@@ -41,7 +41,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + unsigned int *rlib_size, + bool *over_subscription) + { +- unsigned int process_count, queue_count, compute_queue_count; ++ unsigned int process_count, queue_count, compute_queue_count, gws_queue_count; + unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; +@@ -50,6 +50,7 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count - + pm->dqm->xgmi_sdma_queue_count; ++ gws_queue_count = pm->dqm->gws_queue_count; + + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and +@@ -62,7 +63,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || +- compute_queue_count > get_queues_num(pm->dqm)) { ++ compute_queue_count > get_queues_num(pm->dqm) || ++ gws_queue_count > 1) { + *over_subscription = true; + pr_debug("Over subscribed runlist\n"); + } +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index 6bf5be992303..9ac50a4eb294 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -466,6 +466,10 @@ enum KFD_QUEUE_PRIORITY { + * @is_active: Defines if the queue is active or not. @is_active and + * @is_evicted are protected by the DQM lock. + * ++ * @is_gws: Defines if the queue has been updated to be GWS-capable or not. 
++ * @is_gws should be protected by the DQM lock, since changing it can yield the ++ * possibility of updating DQM state on number of GWS queues. ++ * + * @vmid: If the scheduling mode is no cp scheduling the field defines the vmid + * of the queue. + * +@@ -490,6 +494,7 @@ struct queue_properties { + bool is_suspended; + bool is_active; + bool is_new; ++ bool is_gws; + /* Not relevant for user mode queues in cp scheduling */ + unsigned int vmid; + /* Relevant only for sdma queues*/ +@@ -628,6 +633,14 @@ struct qcm_process_device { + */ + bool reset_wavefronts; + ++ /* This flag tells us if this process has a GWS-capable ++ * queue that will be mapped into the runlist. It's ++ * possible to request a GWS BO, but not have the queue ++ * currently mapped, and this changes how the MAP_PROCESS ++ * PM4 packet is configured. ++ */ ++ bool mapped_gws_queue; ++ + /* + * All the memory management data should be here too + */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +index c14fdf3bda75..d7e057376d8f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c +@@ -824,6 +824,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->qpd.evicted = 0; ++ pdd->qpd.mapped_gws_queue = false; + mutex_init(&pdd->qpd.doorbell_lock); + pdd->process = p; + pdd->bound = PDD_UNBOUND; +-- +2.17.1 + |