diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch | 95 |
1 files changed, 95 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch new file mode 100644 index 00000000..810287e1 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3675-drm-amdgpu-Fix-bugs-in-amdgpu_device_gpu_recover-in-.patch @@ -0,0 +1,95 @@ +From 5186058f12d83bfac377975bb18adeb2c0f28268 Mon Sep 17 00:00:00 2001 +From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Date: Fri, 30 Aug 2019 10:31:18 -0400 +Subject: [PATCH 3675/4256] drm/amdgpu: Fix bugs in amdgpu_device_gpu_recover + in XGMI case. + +Issue 1: +In XGMI case amdgpu_device_lock_adev for other devices in hive +was called too late, after access to their respective schedulers. +So relocate the lock to the beginning of accessing the other devs. + +Issue 2: +Using amdgpu_device_ip_need_full_reset to switch the device list from +all devices in hive to the single 'master' device who owns this reset +call is wrong because when stopping schedulers we iterate all the devices +in hive but when restarting we will only reactivate the 'master' device. +Also, in case amdgpu_device_pre_asic_reset concludes that full reset IS +needed we then have to stop schedulers for all devices in hive and not +only the 'master' but with amdgpu_device_ip_need_full_reset we +already missed the opportunity to do so. So just remove this logic and +always stop and start all schedulers for all devices in hive. + +Also minor cleanup and print fix. + +v4: Minor coding style fix. 
+ +Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> +Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++------------ + 1 file changed, 10 insertions(+), 13 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index f10748306462..648852649fc2 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3808,15 +3808,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + device_list_handle = &device_list; + } + +- /* +- * Mark these ASICs to be reseted as untracked first +- * And add them back after reset completed +- */ +- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) +- amdgpu_unregister_gpu_instance(tmp_adev); +- + /* block all schedulers and reset given job's ring */ + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { ++ if (tmp_adev != adev) ++ amdgpu_device_lock_adev(tmp_adev, false); ++ /* ++ * Mark these ASICs to be reseted as untracked first ++ * And add them back after reset completed ++ */ ++ amdgpu_unregister_gpu_instance(tmp_adev); ++ + /* disable ras on ALL IPs */ + if (amdgpu_device_ip_need_full_reset(tmp_adev)) + amdgpu_ras_suspend(tmp_adev); +@@ -3842,9 +3843,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + dma_fence_is_signaled(job->base.s_fence->parent)) + job_signaled = true; + +- if (!amdgpu_device_ip_need_full_reset(adev)) +- device_list_handle = &device_list; +- + if (job_signaled) { + dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); + goto skip_hw_reset; +@@ -3866,7 +3864,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + if (tmp_adev == adev) + continue; + +- amdgpu_device_lock_adev(tmp_adev, false); + r = amdgpu_device_pre_asic_reset(tmp_adev, + NULL, + &need_full_reset); +@@ -3915,10 +3912,10 @@ int 
amdgpu_device_gpu_recover(struct amdgpu_device *adev, + + if (r) { + /* bad news, how to tell it to userspace ? */ +- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); ++ dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); + amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); + } else { +- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); ++ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); + } + + amdgpu_device_unlock_adev(tmp_adev); +-- +2.17.1 + |