From f9e7b93c88de0eec9a2cd386b37e8bfc2c38d31b Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 28 Jul 2017 18:15:46 -0400 Subject: [PATCH 1758/4131] drm/amdkfd: Fix a bug that vmid is released before resetting wavefronts When no HWS is used, vmid is always released after the last queue is destroyed rather than when the process terminates. With the current code, when a process terminates with all queues destroyed and somehow we need to reset wavefronts, dbgdev_wave_reset_wavefronts() will fail because no vmid is bound to this process any more. With this commit, we will reset the wavefronts, if needed, just before releasing the vmid. As part of the change, the wavefronts reset handling is moved to DQM from PQM, resulting in clearer logic. Change-Id: Ib72b7dc1d910045130928a8e20729b884a55b335 Signed-off-by: Yong Zhao Conflicts[4.12]: drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c --- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 24 +++++++++++++++++++++- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 11 +++++----- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 1 - .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 15 -------------- 4 files changed, 29 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 77cabd1..8dbbbeb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -421,12 +421,26 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_HIQ_TIMEOUT, q->pipe, q->queue); + if (retval == -ETIME) + qpd->reset_wavefronts = true; mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); list_del(&q->list); - if (list_empty(&qpd->queues_list)) + if (list_empty(&qpd->queues_list)) { + if (qpd->reset_wavefronts) { + pr_warn("Resetting wave fronts (nocpsch) on dev %p\n", + dqm->dev); + /* dbgdev_wave_reset_wavefronts has to be called before + * deallocate_vmid(), i.e. when vmid is still in use. + */ + dbgdev_wave_reset_wavefronts(dqm->dev, + qpd->pqm->process); + qpd->reset_wavefronts = false; + } + deallocate_vmid(dqm, qpd, q); + } if (q->properties.is_active) dqm->queue_count--; @@ -1307,6 +1321,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, dqm->queue_count--; retval = execute_queues_cpsch(dqm, false, false); + if (retval == -ETIME) + qpd->reset_wavefronts = true; mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj); @@ -1533,6 +1549,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, retval = execute_queues_cpsch(dqm, true, true); + if (retval || qpd->reset_wavefronts) { + pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); + dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); + qpd->reset_wavefronts = false; + } + /* lastly, free mqd resources */ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { mqd = dqm->ops.get_mqd_manager(dqm, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a711c86..9e28454 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -540,6 +540,12 @@ struct qcm_process_device { unsigned int vmid; bool is_debug; unsigned int evicted; /* eviction counter, 0=active */ + + /* This flag tells if we should reset all wavefronts on + * process termination + */ + bool reset_wavefronts; + /* * All the memory management data should be here too */ @@ -641,11 +647,6 @@ struct kfd_process_device { /* GPUVM allocations storage */ struct idr alloc_idr; - /* This flag tells if we should reset all - * wavefronts on process termination - */ - bool reset_wavefronts; - /* Flag used to tell the pdd has dequeued from the dqm. * This is used to prevent dev->dqm->ops.process_termination() from * being called twice when it is already called in IOMMU callback diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 39d9e6d2..baf1f75 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -705,7 +705,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, pdd->qpd.dqm = dev->dqm; pdd->qpd.pqm = &p->pqm; pdd->qpd.evicted = 0; - pdd->reset_wavefronts = false; pdd->process = p; pdd->bound = PDD_UNBOUND; pdd->already_dequeued = false; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 920fe18..3601b70 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -66,7 +66,6 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) { struct kfd_dev *dev = pdd->dev; - struct kfd_process *p = pdd->process; int retval; if (pdd->already_dequeued) @@ -74,16 +73,6 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) retval = dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd); pdd->already_dequeued = true; - /* Checking pdd->reset_wavefronts may not be needed, because - * if reset_wavefronts was set to true before, which means unmapping - * failed, process_termination should fail too until we reset - * wavefronts. Now we put the check there to be safe. - */ - if (retval || pdd->reset_wavefronts) { - pr_warn("Resetting wave fronts on dev %p\n", dev); - dbgdev_wave_reset_wavefronts(dev, p); - pdd->reset_wavefronts = false; - } } void kfd_process_dequeue_from_all_devices(struct kfd_process *p) @@ -337,10 +326,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) kfree(pqn->q->properties.cu_mask); pqn->q->properties.cu_mask = NULL; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); - if (retval != 0) { - if (retval == -ETIME) - pdd->reset_wavefronts = true; - } uninit_queue(pqn->q); } -- 2.7.4