aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch264
1 files changed, 264 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch
new file mode 100644
index 00000000..6c12980b
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4271-drm-amdkfd-don-t-use-dqm-lock-during-device-reset-su.patch
@@ -0,0 +1,264 @@
+From e4339c1a7be1491127ecfb14b1631197e5fdb28b Mon Sep 17 00:00:00 2001
+From: Philip Yang <Philip.Yang@amd.com>
+Date: Fri, 18 Oct 2019 10:15:21 -0400
+Subject: [PATCH 4271/4736] drm/amdkfd: don't use dqm lock during device
+ reset/suspend/resume
+
+If device reset/suspend/resume fails for some reason, the dqm lock is
+held forever and this causes a deadlock. Below is a kernel backtrace
+from when an application opened kfd after suspend/resume failed.
+
+Instead of holding dqm lock in pre_reset and releasing dqm lock in
+post_reset, add dqm->sched_running flag which is modified in
+dqm->ops.start and dqm->ops.stop. The flag doesn't need lock protection
+because write/read are all inside dqm lock.
+
+For the HWS case, map_queues_cpsch and unmap_queues_cpsch check the
+sched_running flag before sending the updated runlist.
+
+v2: For the no-HWS case, when the device is stopped, don't call
+load/destroy_mqd for eviction, restore and create queue, and avoid
+the debugfs hqds dump.
+
+Backtrace of dqm lock deadlock:
+
+[Thu Oct 17 16:43:37 2019] INFO: task rocminfo:3024 blocked for more
+than 120 seconds.
+[Thu Oct 17 16:43:37 2019] Not tainted
+5.0.0-rc1-kfd-compute-rocm-dkms-no-npi-1131 #1
+[Thu Oct 17 16:43:37 2019] "echo 0 >
+/proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[Thu Oct 17 16:43:37 2019] rocminfo D 0 3024 2947
+0x80000000
+[Thu Oct 17 16:43:37 2019] Call Trace:
+[Thu Oct 17 16:43:37 2019] ? __schedule+0x3d9/0x8a0
+[Thu Oct 17 16:43:37 2019] schedule+0x32/0x70
+[Thu Oct 17 16:43:37 2019] schedule_preempt_disabled+0xa/0x10
+[Thu Oct 17 16:43:37 2019] __mutex_lock.isra.9+0x1e3/0x4e0
+[Thu Oct 17 16:43:37 2019] ? __call_srcu+0x264/0x3b0
+[Thu Oct 17 16:43:37 2019] ? process_termination_cpsch+0x24/0x2f0
+[amdgpu]
+[Thu Oct 17 16:43:37 2019] process_termination_cpsch+0x24/0x2f0
+[amdgpu]
+[Thu Oct 17 16:43:37 2019]
+kfd_process_dequeue_from_all_devices+0x42/0x60 [amdgpu]
+[Thu Oct 17 16:43:37 2019] kfd_process_notifier_release+0x1be/0x220
+[amdgpu]
+[Thu Oct 17 16:43:37 2019] __mmu_notifier_release+0x3e/0xc0
+[Thu Oct 17 16:43:37 2019] exit_mmap+0x160/0x1a0
+[Thu Oct 17 16:43:37 2019] ? __handle_mm_fault+0xba3/0x1200
+[Thu Oct 17 16:43:37 2019] ? exit_robust_list+0x5a/0x110
+[Thu Oct 17 16:43:37 2019] mmput+0x4a/0x120
+[Thu Oct 17 16:43:37 2019] do_exit+0x284/0xb20
+[Thu Oct 17 16:43:37 2019] ? handle_mm_fault+0xfa/0x200
+[Thu Oct 17 16:43:37 2019] do_group_exit+0x3a/0xa0
+[Thu Oct 17 16:43:37 2019] __x64_sys_exit_group+0x14/0x20
+[Thu Oct 17 16:43:37 2019] do_syscall_64+0x4f/0x100
+[Thu Oct 17 16:43:37 2019] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Change-Id: Iecaa52a3fa406a8b8f219ae800993f42678ceddd
+Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Signed-off-by: Philip Yang <Philip.Yang@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
+---
+ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 5 --
+ .../drm/amd/amdkfd/kfd_device_queue_manager.c | 47 +++++++++++++++++--
+ .../drm/amd/amdkfd/kfd_device_queue_manager.h | 1 +
+ 3 files changed, 43 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+index ee9b9a6968bd..eb5eeba8792d 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+@@ -744,9 +744,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+ return 0;
+ kgd2kfd_suspend(kfd);
+
+- /* hold dqm->lock to prevent further execution*/
+- dqm_lock(kfd->dqm);
+-
+ kfd_signal_reset_event(kfd);
+ return 0;
+ }
+@@ -767,8 +764,6 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
+ if (!kfd->init_complete)
+ return 0;
+
+- dqm_unlock(kfd->dqm);
+-
+ ret = kfd_resume(kfd);
+ if (ret)
+ return ret;
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+index 0b63740b4c63..2f0aeb60fe40 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+@@ -369,6 +369,10 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
+ &q->gart_mqd_addr, &q->properties);
+ if (q->properties.is_active) {
++ if (!dqm->sched_running) {
++ WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
++ goto add_queue_to_list;
++ }
+
+ if (WARN(q->process->mm != current->mm,
+ "should only run in user thread"))
+@@ -380,6 +384,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
+ goto out_free_mqd;
+ }
+
++add_queue_to_list:
+ list_add(&q->list, &qpd->queues_list);
+ qpd->queue_count++;
+ if (q->properties.is_active)
+@@ -487,6 +492,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
+
+ deallocate_doorbell(qpd, q);
+
++ if (!dqm->sched_running) {
++ WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
++ return 0;
++ }
++
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
+ KFD_UNMAP_LATENCY_MS,
+@@ -568,6 +578,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
+ (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+ q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
++
++ if (!dqm->sched_running) {
++ WARN_ONCE(1, "Update non-HWS queue while stopped\n");
++ goto out_unlock;
++ }
++
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+ KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+@@ -719,6 +735,11 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
+ q->properties.is_active = false;
++ dqm->queue_count--;
++
++ if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
++ continue;
++
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
+ KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
+ KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
+@@ -727,7 +748,6 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
+ * maintain a consistent eviction state
+ */
+ ret = retval;
+- dqm->queue_count--;
+ if (q->properties.is_gws) {
+ dqm->gws_queue_count--;
+ qpd->mapped_gws_queue = false;
+@@ -837,6 +857,11 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
+ mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+ q->properties.type)];
+ q->properties.is_active = true;
++ dqm->queue_count++;
++
++ if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
++ continue;
++
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
+ q->queue, &q->properties, mm);
+ if (retval && !ret)
+@@ -844,7 +869,6 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
+ * maintain a consistent eviction state
+ */
+ ret = retval;
+- dqm->queue_count++;
+ if (q->properties.is_gws) {
+ dqm->gws_queue_count++;
+ qpd->mapped_gws_queue = true;
+@@ -1042,7 +1066,8 @@ static int start_nocpsch(struct device_queue_manager *dqm)
+
+ if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
+ return pm_init(&dqm->packets, dqm);
+-
++ dqm->sched_running = true;
++
+ return 0;
+ }
+
+@@ -1050,7 +1075,8 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
+ {
+ if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
+ pm_uninit(&dqm->packets);
+-
++ dqm->sched_running = false;
++
+ return 0;
+ }
+
+@@ -1206,6 +1232,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
+ dqm_lock(dqm);
+ /* clear hang status when driver try to start the hw scheduler */
+ dqm->is_hws_hang = false;
++ dqm->sched_running = true;
+ execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
+ USE_DEFAULT_GRACE_PERIOD);
+ dqm_unlock(dqm);
+@@ -1223,6 +1250,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
+ dqm_lock(dqm);
+ unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
+ USE_DEFAULT_GRACE_PERIOD);
++ dqm->sched_running = false;
+ dqm_unlock(dqm);
+
+ kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
+@@ -1413,9 +1441,10 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
+ {
+ int retval;
+
++ if (!dqm->sched_running)
++ return 0;
+ if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
+ return 0;
+-
+ if (dqm->active_runlist)
+ return 0;
+
+@@ -1438,6 +1467,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
+ {
+ int retval = 0;
+
++ if (!dqm->sched_running)
++ return 0;
+ if (dqm->is_hws_hang)
+ return -EIO;
+ if (!dqm->active_runlist)
+@@ -2375,6 +2406,12 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
+ int pipe, queue;
+ int r = 0;
+
++ if (!dqm->sched_running) {
++ seq_printf(m, " Device is stopped\n");
++
++ return 0;
++ }
++
+ r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
+ KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs);
+ if (!r) {
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+index 48e3b89e27c3..a5e045206fb7 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+@@ -205,6 +205,7 @@ struct device_queue_manager {
+ struct work_struct hw_exception_work;
+ struct kfd_mem_obj hiq_sdma_mqd;
+ uint32_t wait_times;
++ bool sched_running;
+ };
+
+ void device_queue_manager_init_cik(
+--
+2.17.1
+