aboutsummaryrefslogtreecommitdiffstats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch')
-rw-r--r--meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch182
1 files changed, 182 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch
new file mode 100644
index 00000000..b549c229
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/4662-drm-amdgpu-add-concurrent-baco-reset-support-for-XGM.patch
@@ -0,0 +1,182 @@
+From 2d2a5a052ea8e4ada3cfabafd23a9e1b896a23ee Mon Sep 17 00:00:00 2001
+From: Le Ma <le.ma@amd.com>
+Date: Tue, 26 Nov 2019 22:12:31 +0800
+Subject: [PATCH 4662/4736] drm/amdgpu: add concurrent baco reset support for
+ XGMI
+
+Currently each XGMI node reset wq does not run in parallel if bound to the
+same cpu. Make a change to bind the xgmi_reset_work items to different cpus.
+
+XGMI requires all nodes enter into baco within very close proximity before
+any node exits baco. So schedule the xgmi_reset_work wq twice for enter/exit
+baco respectively.
+
+To use baco for XGMI, PMFW supported for baco on XGMI needs to be involved.
+
+The case that PSP reset and baco reset coexist within an XGMI hive never
+exists and is not under consideration.
+
+v2: define use_baco flag to simplify the code for xgmi baco sequence
+
+Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
+Signed-off-by: Le Ma <le.ma@amd.com>
+Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 82 ++++++++++++++++++----
+ 2 files changed, 72 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 4eddee90553b..566ae8bf2ba7 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -1040,6 +1040,8 @@ struct amdgpu_device {
+
+ bool pm_sysfs_en;
+ bool ucode_sysfs_en;
++
++ bool in_baco;
+ };
+
+ static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+index 863590e169ac..2ca9d556c084 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+@@ -2663,7 +2663,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
+ struct amdgpu_device *adev =
+ container_of(__work, struct amdgpu_device, xgmi_reset_work);
+
+- adev->asic_reset_res = amdgpu_asic_reset(adev);
++ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
++ adev->asic_reset_res = (adev->in_baco == false) ?
++ amdgpu_device_baco_enter(adev->ddev) :
++ amdgpu_device_baco_exit(adev->ddev);
++ else
++ adev->asic_reset_res = amdgpu_asic_reset(adev);
++
+ if (adev->asic_reset_res)
+ DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
+ adev->asic_reset_res, adev->ddev->unique);
+@@ -3795,13 +3801,18 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+ return r;
+ }
+
+-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
++static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
++ struct amdgpu_hive_info *hive,
+ struct list_head *device_list_handle,
+ bool *need_full_reset_arg)
+ {
+ struct amdgpu_device *tmp_adev = NULL;
+ bool need_full_reset = *need_full_reset_arg, vram_lost = false;
+ int r = 0;
++ int cpu = smp_processor_id();
++ bool use_baco =
++ (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
++ true : false;
+
+ /*
+ * ASIC reset has to be done on all HGMI hive nodes ASAP
+@@ -3809,21 +3820,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ */
+ if (need_full_reset) {
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+- /* For XGMI run all resets in parallel to speed up the process */
++ /*
++ * For XGMI run all resets in parallel to speed up the
++ * process by scheduling the highpri wq on different
++ * cpus. For XGMI with baco reset, all nodes must enter
++				 * baco within close proximity before any node exits.
++ */
+ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+- if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
++ if (!queue_work_on(cpu, system_highpri_wq,
++ &tmp_adev->xgmi_reset_work))
+ r = -EALREADY;
++ cpu = cpumask_next(cpu, cpu_online_mask);
+ } else
+ r = amdgpu_asic_reset(tmp_adev);
+-
+- if (r) {
+- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+- r, tmp_adev->ddev->unique);
++ if (r)
+ break;
+- }
+ }
+
+- /* For XGMI wait for all PSP resets to complete before proceed */
++ /* For XGMI wait for all work to complete before proceed */
+ if (!r) {
+ list_for_each_entry(tmp_adev, device_list_handle,
+ gmc.xgmi.head) {
+@@ -3832,11 +3846,54 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+ r = tmp_adev->asic_reset_res;
+ if (r)
+ break;
++ if (use_baco)
++ tmp_adev->in_baco = true;
+ }
+ }
+ }
+- }
+
++ /*
++ * For XGMI with baco reset, need exit baco phase by scheduling
++	 * xgmi_reset_work one more time. PSP reset and sGPU skip this
++	 * phase. Do not assume that PSP reset and baco reset can
++	 * coexist within an XGMI hive.
++ */
++
++ if (!r && use_baco) {
++ cpu = smp_processor_id();
++ list_for_each_entry(tmp_adev, device_list_handle,
++ gmc.xgmi.head) {
++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
++ if (!queue_work_on(cpu,
++ system_highpri_wq,
++ &tmp_adev->xgmi_reset_work))
++ r = -EALREADY;
++ if (r)
++ break;
++ cpu = cpumask_next(cpu, cpu_online_mask);
++ }
++ }
++ }
++
++ if (!r && use_baco) {
++ list_for_each_entry(tmp_adev, device_list_handle,
++ gmc.xgmi.head) {
++ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
++ flush_work(&tmp_adev->xgmi_reset_work);
++ r = tmp_adev->asic_reset_res;
++ if (r)
++ break;
++ tmp_adev->in_baco = false;
++ }
++ }
++ }
++
++ if (r) {
++ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
++ r, tmp_adev->ddev->unique);
++ goto end;
++ }
++ }
+
+ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ if (need_full_reset) {
+@@ -4121,7 +4178,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+ if (r)
+ adev->asic_reset_res = r;
+ } else {
+- r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
++ r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
++ &need_full_reset);
+ if (r && r == -EAGAIN)
+ goto retry;
+ }
+--
+2.17.1
+