about | summary | refs | log | tree | commit | diff | stats
path: root/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch')
-rw-r--r-- common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch | 117
1 files changed, 117 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch
new file mode 100644
index 00000000..beac02ef
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch
@@ -0,0 +1,117 @@
+From 00eb1f0ed857d15fedd6455086cba0080d5cc824 Mon Sep 17 00:00:00 2001
+From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Date: Thu, 29 Nov 2018 15:14:27 -0500
+Subject: [PATCH 0896/2940] drm/amdgpu: Implement concurrent asic reset for
+ XGMI.
+
+Use a per-device work item to concurrently send reset commands to all
+nodes in the hive.
+
+v2:
+Switch to system_highpri_wq after dropping dedicated queue.
+Fix non XGMI code path KASAN error.
+Stop the per-node hive reset loop if the reset fails on any of the
+nodes.
+
+Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Acked-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +++++++++++++++++++---
+ 2 files changed, 41 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 98b40c96ca13..f84a1a611c56 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -948,7 +948,9 @@ struct amdgpu_device {
+ bool in_gpu_reset;
+ struct mutex lock_reset;
+ struct amdgpu_doorbell_index doorbell_index;
++
+ int asic_reset_res;
++ struct work_struct xgmi_reset_work;
+ };
+
+ static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+index 372b2a0131b1..24433648fd2e 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+@@ -2405,6 +2405,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
+ return amdgpu_device_asic_has_dc_support(adev->asic_type);
+ }
+
++/* XGMI reset work: reset the owning ASIC and save the status in asic_reset_res. */
++static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
++{
++ struct amdgpu_device *adev =
++ container_of(__work, struct amdgpu_device, xgmi_reset_work);
++
++ adev->asic_reset_res = amdgpu_asic_reset(adev);
++ if (adev->asic_reset_res)
++ DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
++ adev->asic_reset_res, adev->ddev->unique);
++}
++
++
+ /**
+ * amdgpu_device_init - initialize the driver
+ *
+@@ -2503,6 +2516,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
+ INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
+ amdgpu_device_delay_enable_gfx_off);
+
++ INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
++
+ adev->gfx.gfx_off_req_count = 1;
+ adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
+
+@@ -3380,10 +3395,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ */
+ if (need_full_reset) {
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+- r = amdgpu_asic_reset(tmp_adev);
+- if (r)
+- DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
++ /* For XGMI run all resets in parallel to speed up the process */
++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
++ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
++ r = -EALREADY;
++ } else
++ r = amdgpu_asic_reset(tmp_adev);
++
++ if (r) {
++ DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
++ break;
++ }
++ }
++
++ /* For XGMI wait for all PSP resets to complete before proceed */
++ if (!r) {
++ list_for_each_entry(tmp_adev, device_list_handle,
++ gmc.xgmi.head) {
++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
++ flush_work(&tmp_adev->xgmi_reset_work);
++ r = tmp_adev->asic_reset_res;
++ if (r)
++ break;
++ }
++ }
+ }
+ }
+
+@@ -3570,8 +3606,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ if (tmp_adev == adev)
+ continue;
+
+- dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);
+-
+ amdgpu_device_lock_adev(tmp_adev);
+ r = amdgpu_device_pre_asic_reset(tmp_adev,
+ NULL,
+--
+2.17.1
+