diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/1852-drm-amdgpu-Avoid-HW-reset-if-guilty-job-already-sign.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/1852-drm-amdgpu-Avoid-HW-reset-if-guilty-job-already-sign.patch | 258 |
1 files changed, 258 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/1852-drm-amdgpu-Avoid-HW-reset-if-guilty-job-already-sign.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/1852-drm-amdgpu-Avoid-HW-reset-if-guilty-job-already-sign.patch new file mode 100644 index 00000000..bdcc0e0d --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/1852-drm-amdgpu-Avoid-HW-reset-if-guilty-job-already-sign.patch @@ -0,0 +1,258 @@ +From 2ba91eda1578aff807b1bba468381220fa766c71 Mon Sep 17 00:00:00 2001 +From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Date: Wed, 3 Apr 2019 16:58:51 -0400 +Subject: [PATCH 1852/2940] drm/amdgpu: Avoid HW reset if guilty job already + signaled. +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Also reject TDRs if another one already running. + +v2: +Stop all schedulers across device and entire XGMI hive before +force signaling HW fences. +Avoid passing job_signaled to helper fnctions to keep all the decision +making about skipping HW reset in one place. + +v3: +Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced +against it's decrement in drm_sched_stop in non HW reset case. +v4: rebase +v5: Revert v3 as we do it now in sceduler code. + +Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Reviewed-by: Christian König <christian.koenig@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 ++++++++++++++------- + 1 file changed, 95 insertions(+), 48 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 9b9744023785..0eeb3c0cf911 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3387,8 +3387,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, + if (!ring || !ring->sched.thread) + continue; + +- drm_sched_stop(&ring->sched, &job->base); +- + /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ + amdgpu_fence_driver_force_completion(ring); + } +@@ -3396,6 +3394,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, + if(job) + drm_sched_increase_karma(&job->base); + ++ /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ + if (!amdgpu_sriov_vf(adev)) { + + if (!need_full_reset) +@@ -3533,37 +3532,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + return r; + } + +-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev) ++static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) + { +- int i; +- +- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { +- struct amdgpu_ring *ring = adev->rings[i]; +- +- if (!ring || !ring->sched.thread) +- continue; +- +- if (!adev->asic_reset_res) +- drm_sched_resubmit_jobs(&ring->sched); ++ if (trylock) { ++ if (!mutex_trylock(&adev->lock_reset)) ++ return false; ++ } else ++ mutex_lock(&adev->lock_reset); + +- drm_sched_start(&ring->sched, !adev->asic_reset_res); +- } +- +- if (!amdgpu_device_has_dc_support(adev)) { +- drm_helper_resume_force_mode(adev->ddev); +- } +- +- adev->asic_reset_res = 0; +-} +- +-static void amdgpu_device_lock_adev(struct amdgpu_device *adev) +-{ +- mutex_lock(&adev->lock_reset); + atomic_inc(&adev->gpu_reset_counter); + adev->in_gpu_reset = 1; + /* Block kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_pre_reset(adev); ++ ++ return true; + } + + static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) +@@ -3591,40 +3574,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) + int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job) + { +- int r; ++ struct list_head device_list, *device_list_handle = NULL; ++ bool need_full_reset, job_signaled; + struct amdgpu_hive_info *hive = NULL; +- bool need_full_reset = false; + struct amdgpu_device *tmp_adev = NULL; +- struct list_head device_list, *device_list_handle = NULL; ++ int i, r = 0; + ++ need_full_reset = job_signaled = false; + INIT_LIST_HEAD(&device_list); + + dev_info(adev->dev, "GPU reset begin!\n"); + ++ hive = amdgpu_get_xgmi_hive(adev, false); ++ + /* +- * In case of XGMI hive disallow concurrent resets to be triggered +- * by different nodes. No point also since the one node already executing +- * reset will also reset all the other nodes in the hive. ++ * Here we trylock to avoid chain of resets executing from ++ * either trigger by jobs on different adevs in XGMI hive or jobs on ++ * different schedulers for same device while this TO handler is running. ++ * We always reset all schedulers for device and all devices for XGMI ++ * hive so that should take care of them too. + */ +- hive = amdgpu_get_xgmi_hive(adev, 0); +- if (hive && adev->gmc.xgmi.num_physical_nodes > 1 && +- !mutex_trylock(&hive->reset_lock)) ++ ++ if (hive && !mutex_trylock(&hive->reset_lock)) { ++ DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", ++ job->base.id, hive->hive_id); + return 0; ++ } + + /* Start with adev pre asic reset first for soft reset check.*/ +- amdgpu_device_lock_adev(adev); +- r = amdgpu_device_pre_asic_reset(adev, +- job, +- &need_full_reset); +- if (r) { +- /*TODO Should we stop ?*/ +- DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", +- r, adev->ddev->unique); +- adev->asic_reset_res = r; ++ if (!amdgpu_device_lock_adev(adev, !hive)) { ++ DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", ++ job->base.id); ++ return 0; + } + + /* Build list of devices to reset */ +- if (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) { ++ if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!hive) { + amdgpu_device_unlock_adev(adev); + return -ENODEV; +@@ -3641,13 +3626,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + device_list_handle = &device_list; + } + ++ /* block all schedulers and reset given job's ring */ ++ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { ++ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { ++ struct amdgpu_ring *ring = tmp_adev->rings[i]; ++ ++ if (!ring || !ring->sched.thread) ++ continue; ++ ++ drm_sched_stop(&ring->sched, &job->base); ++ } ++ } ++ ++ ++ /* ++ * Must check guilty signal here since after this point all old ++ * HW fences are force signaled. ++ * ++ * job->base holds a reference to parent fence ++ */ ++ if (job && job->base.s_fence->parent && ++ dma_fence_is_signaled(job->base.s_fence->parent)) ++ job_signaled = true; ++ ++ if (!amdgpu_device_ip_need_full_reset(adev)) ++ device_list_handle = &device_list; ++ ++ if (job_signaled) { ++ dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); ++ goto skip_hw_reset; ++ } ++ ++ ++ /* Guilty job will be freed after this*/ ++ r = amdgpu_device_pre_asic_reset(adev, ++ job, ++ &need_full_reset); ++ if (r) { ++ /*TODO Should we stop ?*/ ++ DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", ++ r, adev->ddev->unique); ++ adev->asic_reset_res = r; ++ } ++ + retry: /* Rest of adevs pre asic reset from XGMI hive. */ + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + + if (tmp_adev == adev) + continue; + +- amdgpu_device_lock_adev(tmp_adev); ++ amdgpu_device_lock_adev(tmp_adev, false); + r = amdgpu_device_pre_asic_reset(tmp_adev, + NULL, + &need_full_reset); +@@ -3671,9 +3699,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + goto retry; + } + ++skip_hw_reset: ++ + /* Post ASIC reset for all devs .*/ + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { +- amdgpu_device_post_asic_reset(tmp_adev); ++ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { ++ struct amdgpu_ring *ring = tmp_adev->rings[i]; ++ ++ if (!ring || !ring->sched.thread) ++ continue; ++ ++ /* No point to resubmit jobs if we didn't HW reset*/ ++ if (!tmp_adev->asic_reset_res && !job_signaled) ++ drm_sched_resubmit_jobs(&ring->sched); ++ ++ drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); ++ } ++ ++ if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { ++ drm_helper_resume_force_mode(tmp_adev->ddev); ++ } ++ ++ tmp_adev->asic_reset_res = 0; + + if (r) { + /* bad news, how to tell it to userspace ? */ +@@ -3686,7 +3733,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + amdgpu_device_unlock_adev(tmp_adev); + } + +- if (hive && adev->gmc.xgmi.num_physical_nodes > 1) ++ if (hive) + mutex_unlock(&hive->reset_lock); + + if (r) +-- +2.17.1 + |