aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch95
1 files changed, 95 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch
new file mode 100644
index 00000000..810287e1
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch
@@ -0,0 +1,95 @@
+From 5186058f12d83bfac377975bb18adeb2c0f28268 Mon Sep 17 00:00:00 2001
+From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Date: Fri, 30 Aug 2019 10:31:18 -0400
+Subject: [PATCH 3675/4256] drm/amdgpu: Fix bugs in amdgpu_device_gpu_recover
+ in XGMI case.
+
+Issue 1:
+In XGMI case amdgpu_device_lock_adev for other devices in hive
+was called to late, after access to their repsective schedulers.
+So relocate the lock to the begining of accessing the other devs.
+
+Issue 2:
+Using amdgpu_device_ip_need_full_reset to switch the device list from
+all devices in hive to the single 'master' device who owns this reset
+call is wrong because when stopping schedulers we iterate all the devices
+in hive but when restarting we will only reactivate the 'master' device.
+Also, in case amdgpu_device_pre_asic_reset conlcudes that full reset IS
+needed we then have to stop schedulers for all devices in hive and not
+only the 'master' but with amdgpu_device_ip_need_full_reset we
+already missed the opprotunity do to so. So just remove this logic and
+always stop and start all schedulers for all devices in hive.
+
+Also minor cleanup and print fix.
+
+v4: Minor coding style fix.
+
+Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Acked-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++------------
+ 1 file changed, 10 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+index f10748306462..648852649fc2 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+@@ -3808,15 +3808,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ device_list_handle = &device_list;
+ }
+
+- /*
+- * Mark these ASICs to be reseted as untracked first
+- * And add them back after reset completed
+- */
+- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
+- amdgpu_unregister_gpu_instance(tmp_adev);
+-
+ /* block all schedulers and reset given job's ring */
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
++ if (tmp_adev != adev)
++ amdgpu_device_lock_adev(tmp_adev, false);
++ /*
++ * Mark these ASICs to be reseted as untracked first
++ * And add them back after reset completed
++ */
++ amdgpu_unregister_gpu_instance(tmp_adev);
++
+ /* disable ras on ALL IPs */
+ if (amdgpu_device_ip_need_full_reset(tmp_adev))
+ amdgpu_ras_suspend(tmp_adev);
+@@ -3842,9 +3843,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ dma_fence_is_signaled(job->base.s_fence->parent))
+ job_signaled = true;
+
+- if (!amdgpu_device_ip_need_full_reset(adev))
+- device_list_handle = &device_list;
+-
+ if (job_signaled) {
+ dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
+ goto skip_hw_reset;
+@@ -3866,7 +3864,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ if (tmp_adev == adev)
+ continue;
+
+- amdgpu_device_lock_adev(tmp_adev, false);
+ r = amdgpu_device_pre_asic_reset(tmp_adev,
+ NULL,
+ &need_full_reset);
+@@ -3915,10 +3912,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+
+ if (r) {
+ /* bad news, how to tell it to userspace ? */
+- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
++ dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ } else {
+- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
++ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ }
+
+ amdgpu_device_unlock_adev(tmp_adev);
+--
+2.17.1
+