diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch b/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch new file mode 100644 index 00000000..beac02ef --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.19.8/0896-drm-amdgpu-Implement-concurrent-asic-reset-for-XGMI.patch @@ -0,0 +1,117 @@ +From 00eb1f0ed857d15fedd6455086cba0080d5cc824 Mon Sep 17 00:00:00 2001 +From: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Date: Thu, 29 Nov 2018 15:14:27 -0500 +Subject: [PATCH 0896/2940] drm/amdgpu: Implement concurrent asic reset for + XGMI. + +Use per hive wq to concurrently send reset commands to all nodes +in the hive. + +v2: +Switch to system_highpri_wq after dropping dedicated queue. +Fix non XGMI code path KASAN error. +Stop the hive reset for each node loop if there +is a reset failure on any of the nodes. + +Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Acked-by: Alex Deucher <alexander.deucher@amd.com> +Signed-off-by: Alex Deucher <alexander.deucher@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +++++++++++++++++++--- + 2 files changed, 41 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 98b40c96ca13..f84a1a611c56 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -948,7 +948,9 @@ struct amdgpu_device { + bool in_gpu_reset; + struct mutex lock_reset; + struct amdgpu_doorbell_index doorbell_index; ++ + int asic_reset_res; ++ struct work_struct xgmi_reset_work; + }; + + static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 372b2a0131b1..24433648fd2e 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -2405,6 +2405,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) + return amdgpu_device_asic_has_dc_support(adev->asic_type); + } + ++ ++static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) ++{ ++ struct amdgpu_device *adev = ++ container_of(__work, struct amdgpu_device, xgmi_reset_work); ++ ++ adev->asic_reset_res = amdgpu_asic_reset(adev); ++ if (adev->asic_reset_res) ++ DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", ++ adev->asic_reset_res, adev->ddev->unique); ++} ++ ++ + /** + * amdgpu_device_init - initialize the driver + * +@@ -2503,6 +2516,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, + INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, + amdgpu_device_delay_enable_gfx_off); + ++ INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); ++ + adev->gfx.gfx_off_req_count = 1; + adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; + +@@ -3380,10 +3395,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + */ + if (need_full_reset) { + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { +- r = amdgpu_asic_reset(tmp_adev); +- if (r) +- DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", ++ /* For XGMI run all resets in parallel to speed up the process */ ++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { ++ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) ++ r = -EALREADY; ++ } else ++ r = amdgpu_asic_reset(tmp_adev); ++ ++ if (r) { ++ DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s", + r, tmp_adev->ddev->unique); ++ break; ++ } ++ } ++ ++ /* For XGMI wait for all PSP resets to complete before proceed */ ++ if (!r) { ++ list_for_each_entry(tmp_adev, device_list_handle, ++ gmc.xgmi.head) { ++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { ++ flush_work(&tmp_adev->xgmi_reset_work); ++ r = tmp_adev->asic_reset_res; ++ if (r) ++ break; ++ } ++ } + } + } + +@@ -3570,8 +3606,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + if (tmp_adev == adev) + continue; + +- dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique); +- + amdgpu_device_lock_adev(tmp_adev); + r = amdgpu_device_pre_asic_reset(tmp_adev, + NULL, +-- +2.17.1 + |