diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch | 182 |
1 files changed, 182 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch new file mode 100644 index 00000000..b549c229 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch @@ -0,0 +1,182 @@ +From 2d2a5a052ea8e4ada3cfabafd23a9e1b896a23ee Mon Sep 17 00:00:00 2001 +From: Le Ma <le.ma@amd.com> +Date: Tue, 26 Nov 2019 22:12:31 +0800 +Subject: [PATCH 4662/4736] drm/amdgpu: add concurrent baco reset support for + XGMI + +Currently each XGMI node reset wq does not run in parallel if bound to the same +cpu. Make change to bind the xgmi_reset_work item to different cpus. + +XGMI requires all nodes to enter baco within very close proximity before +any node exits baco. So schedule the xgmi_reset_work wq twice for enter/exit +baco respectively. + +To use baco for XGMI, PMFW support for baco on XGMI needs to be involved. + +The case that PSP reset and baco reset coexist within an XGMI hive never exists +and is not taken into consideration. 
+ +v2: define use_baco flag to simplify the code for xgmi baco sequence + +Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532 +Signed-off-by: Le Ma <le.ma@amd.com> +Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 82 ++++++++++++++++++---- + 2 files changed, 72 insertions(+), 12 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 4eddee90553b..566ae8bf2ba7 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -1040,6 +1040,8 @@ struct amdgpu_device { + + bool pm_sysfs_en; + bool ucode_sysfs_en; ++ ++ bool in_baco; + }; + + static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 863590e169ac..2ca9d556c084 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -2663,7 +2663,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) + struct amdgpu_device *adev = + container_of(__work, struct amdgpu_device, xgmi_reset_work); + +- adev->asic_reset_res = amdgpu_asic_reset(adev); ++ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ++ adev->asic_reset_res = (adev->in_baco == false) ? 
++ amdgpu_device_baco_enter(adev->ddev) : ++ amdgpu_device_baco_exit(adev->ddev); ++ else ++ adev->asic_reset_res = amdgpu_asic_reset(adev); ++ + if (adev->asic_reset_res) + DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", + adev->asic_reset_res, adev->ddev->unique); +@@ -3795,13 +3801,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, + return r; + } + +-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, ++static int amdgpu_do_asic_reset(struct amdgpu_device *adev, ++ struct amdgpu_hive_info *hive, + struct list_head *device_list_handle, + bool *need_full_reset_arg) + { + struct amdgpu_device *tmp_adev = NULL; + bool need_full_reset = *need_full_reset_arg, vram_lost = false; + int r = 0; ++ int cpu = smp_processor_id(); ++ bool use_baco = ++ (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? ++ true : false; + + /* + * ASIC reset has to be done on all HGMI hive nodes ASAP +@@ -3809,21 +3820,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + */ + if (need_full_reset) { + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { +- /* For XGMI run all resets in parallel to speed up the process */ ++ /* ++ * For XGMI run all resets in parallel to speed up the ++ * process by scheduling the highpri wq on different ++ * cpus. For XGMI with baco reset, all nodes must enter ++ * baco within close proximity before anyone exit. 
++ */ + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { +- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) ++ if (!queue_work_on(cpu, system_highpri_wq, ++ &tmp_adev->xgmi_reset_work)) + r = -EALREADY; ++ cpu = cpumask_next(cpu, cpu_online_mask); + } else + r = amdgpu_asic_reset(tmp_adev); +- +- if (r) { +- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", +- r, tmp_adev->ddev->unique); ++ if (r) + break; +- } + } + +- /* For XGMI wait for all PSP resets to complete before proceed */ ++ /* For XGMI wait for all work to complete before proceed */ + if (!r) { + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { +@@ -3832,11 +3846,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + r = tmp_adev->asic_reset_res; + if (r) + break; ++ if (use_baco) ++ tmp_adev->in_baco = true; + } + } + } +- } + ++ /* ++ * For XGMI with baco reset, need exit baco phase by scheduling ++ * xgmi_reset_work one more time. PSP reset and sGPU skips this ++ * phase. Not assume the situation that PSP reset and baco reset ++ * coexist within an XGMI hive. 
++ */ ++ ++ if (!r && use_baco) { ++ cpu = smp_processor_id(); ++ list_for_each_entry(tmp_adev, device_list_handle, ++ gmc.xgmi.head) { ++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { ++ if (!queue_work_on(cpu, ++ system_highpri_wq, ++ &tmp_adev->xgmi_reset_work)) ++ r = -EALREADY; ++ if (r) ++ break; ++ cpu = cpumask_next(cpu, cpu_online_mask); ++ } ++ } ++ } ++ ++ if (!r && use_baco) { ++ list_for_each_entry(tmp_adev, device_list_handle, ++ gmc.xgmi.head) { ++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { ++ flush_work(&tmp_adev->xgmi_reset_work); ++ r = tmp_adev->asic_reset_res; ++ if (r) ++ break; ++ tmp_adev->in_baco = false; ++ } ++ } ++ } ++ ++ if (r) { ++ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", ++ r, tmp_adev->ddev->unique); ++ goto end; ++ } ++ } + + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + if (need_full_reset) { +@@ -4121,7 +4178,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + if (r) + adev->asic_reset_res = r; + } else { +- r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); ++ r = amdgpu_do_asic_reset(adev, hive, device_list_handle, ++ &need_full_reset); + if (r && r == -EAGAIN) + goto retry; + } +-- +2.17.1 + |