diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2040-drm-amdgpu-fix-a-race-in-GPU-reset-with-IB-test-v2.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2040-drm-amdgpu-fix-a-race-in-GPU-reset-with-IB-test-v2.patch | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2040-drm-amdgpu-fix-a-race-in-GPU-reset-with-IB-test-v2.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2040-drm-amdgpu-fix-a-race-in-GPU-reset-with-IB-test-v2.patch new file mode 100644 index 00000000..bc9a0c4f --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/2040-drm-amdgpu-fix-a-race-in-GPU-reset-with-IB-test-v2.patch @@ -0,0 +1,266 @@ +From d303ea8dbbb910cd6b02367b063a2a908cf26e15 Mon Sep 17 00:00:00 2001 +From: Alex Deucher <alexander.deucher@amd.com> +Date: Tue, 28 May 2019 14:17:25 -0500 +Subject: [PATCH 2040/2940] drm/amdgpu: fix a race in GPU reset with IB test + (v2) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Split late_init into two functions, one (do_late_init) which +just does the hw init, and late_init which calls do_late_init +and schedules the IB test work. Call do_late_init in +the GPU reset code to run the init code, but not schedule +the IB test code. The IB test code is called directly +in the gpu reset code so no need to run the IB tests +in a separate work thread. If we do, we end up racing. + +v2: Rework late_init. Pull out the mgpu fan boost and xgmi +pstate code into late_init so they get called in all cases. +rename the late_init worker thread to delayed work since it's +just the IB tests now which can happen later. Schedule the +work at init and resume time. It's not needed at reset time +because the IB tests are called directly. + +Reviewed-by: Christian König <christian.koenig@amd.com> +Cc: Xinhui Pan <xinhui.pan@amd.com> +Signed-off-by: Alex Deucher <alexander.deucher@amd.com> +Signed-off-by: Chaudhary Amit Kumar <Chaudharyamit.Kumar@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 116 +++++++++++---------- + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- + 3 files changed, 61 insertions(+), 59 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index e79bcfc1499c..b99fef62357f 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -961,7 +961,7 @@ struct amdgpu_device { + const struct amdgpu_df_funcs *df_funcs; + + /* delayed work_func for deferring clockgating during resume */ +- struct delayed_work late_init_work; ++ struct delayed_work delayed_init_work; + + struct amdgpu_virt virt; + /* firmware VRAM reservation */ +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 1228adcfab49..84a9f0745d77 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -1904,6 +1904,43 @@ static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_power + return 0; + } + ++static int amdgpu_device_enable_mgpu_fan_boost(void) ++{ ++ struct amdgpu_gpu_instance *gpu_ins; ++ struct amdgpu_device *adev; ++ int i, ret = 0; ++ ++ mutex_lock(&mgpu_info.mutex); ++ ++ /* ++ * MGPU fan boost feature should be enabled ++ * only when there are two or more dGPUs in ++ * the system ++ */ ++ if (mgpu_info.num_dgpu < 2) ++ goto out; ++ ++ for (i = 0; i < mgpu_info.num_dgpu; i++) { ++ gpu_ins = &(mgpu_info.gpu_ins[i]); ++ adev = gpu_ins->adev; ++ if (!(adev->flags & AMD_IS_APU) && ++ !gpu_ins->mgpu_fan_enabled && ++ adev->powerplay.pp_funcs && ++ adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { ++ ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); ++ if (ret) ++ break; ++ ++ gpu_ins->mgpu_fan_enabled = 1; ++ } ++ } ++ ++out: ++ mutex_unlock(&mgpu_info.mutex); ++ ++ return ret; ++} ++ + /** + * amdgpu_device_ip_late_init - run late init for hardware IPs + * +@@ -1937,11 +1974,15 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); + amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); + +- queue_delayed_work(system_wq, &adev->late_init_work, +- msecs_to_jiffies(AMDGPU_RESUME_MS)); +- + amdgpu_device_fill_reset_magic(adev); + ++ r = amdgpu_device_enable_mgpu_fan_boost(); ++ if (r) ++ DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); ++ ++ /* set to low pstate by default */ ++ amdgpu_xgmi_set_pstate(adev, 0); ++ + return 0; + } + +@@ -2040,65 +2081,20 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) + return 0; + } + +-static int amdgpu_device_enable_mgpu_fan_boost(void) +-{ +- struct amdgpu_gpu_instance *gpu_ins; +- struct amdgpu_device *adev; +- int i, ret = 0; +- +- mutex_lock(&mgpu_info.mutex); +- +- /* +- * MGPU fan boost feature should be enabled +- * only when there are two or more dGPUs in +- * the system +- */ +- if (mgpu_info.num_dgpu < 2) +- goto out; +- +- for (i = 0; i < mgpu_info.num_dgpu; i++) { +- gpu_ins = &(mgpu_info.gpu_ins[i]); +- adev = gpu_ins->adev; +- if (!(adev->flags & AMD_IS_APU) && +- !gpu_ins->mgpu_fan_enabled && +- adev->powerplay.pp_funcs && +- adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { +- ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); +- if (ret) +- break; +- +- gpu_ins->mgpu_fan_enabled = 1; +- } +- } +- +-out: +- mutex_unlock(&mgpu_info.mutex); +- +- return ret; +-} +- + /** +- * amdgpu_device_ip_late_init_func_handler - work handler for ib test ++ * amdgpu_device_delayed_init_work_handler - work handler for IB tests + * + * @work: work_struct. + */ +-static void amdgpu_device_ip_late_init_func_handler(struct work_struct *work) ++static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) + { + struct amdgpu_device *adev = +- container_of(work, struct amdgpu_device, late_init_work.work); ++ container_of(work, struct amdgpu_device, delayed_init_work.work); + int r; + + r = amdgpu_ib_ring_tests(adev); + if (r) + DRM_ERROR("ib ring test failed (%d).\n", r); +- +- r = amdgpu_device_enable_mgpu_fan_boost(); +- if (r) +- DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); +- +- /*set to low pstate by default */ +- amdgpu_xgmi_set_pstate(adev, 0); +- + } + + static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) +@@ -2571,8 +2567,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, + INIT_LIST_HEAD(&adev->ring_lru_list); + spin_lock_init(&adev->ring_lru_list_lock); + +- INIT_DELAYED_WORK(&adev->late_init_work, +- amdgpu_device_ip_late_init_func_handler); ++ INIT_DELAYED_WORK(&adev->delayed_init_work, ++ amdgpu_device_delayed_init_work_handler); + INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, + amdgpu_device_delay_enable_gfx_off); + +@@ -2806,6 +2802,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, + /* must succeed. */ + amdgpu_ras_resume(adev); + ++ queue_delayed_work(system_wq, &adev->delayed_init_work, ++ msecs_to_jiffies(AMDGPU_RESUME_MS)); ++ + r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); + if (r) { + dev_err(adev->dev, "Could not create pcie_replay_count"); +@@ -2853,7 +2852,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) + adev->firmware.gpu_info_fw = NULL; + } + adev->accel_working = false; +- cancel_delayed_work_sync(&adev->late_init_work); ++ cancel_delayed_work_sync(&adev->delayed_init_work); + /* free i2c buses */ + if (!amdgpu_device_has_dc_support(adev)) + amdgpu_i2c_fini(adev); +@@ -2916,7 +2915,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) + if (fbcon) + amdgpu_fbdev_set_suspend(adev, 1); + +- cancel_delayed_work_sync(&adev->late_init_work); ++ cancel_delayed_work_sync(&adev->delayed_init_work); + + if (!amdgpu_device_has_dc_support(adev)) { + /* turn off display hw */ +@@ -3036,6 +3035,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) + if (r) + return r; + ++ queue_delayed_work(system_wq, &adev->delayed_init_work, ++ msecs_to_jiffies(AMDGPU_RESUME_MS)); ++ + if (!amdgpu_device_has_dc_support(adev)) { + /* pin cursors */ + list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { +@@ -3059,7 +3061,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) + return r; + + /* Make sure IB tests flushed */ +- flush_delayed_work(&adev->late_init_work); ++ flush_delayed_work(&adev->delayed_init_work); + + /* blat the mode back in */ + if (fbcon) { +@@ -3647,7 +3649,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + + dev_info(adev->dev, "GPU reset begin!\n"); + +- cancel_delayed_work_sync(&adev->late_init_work); ++ cancel_delayed_work_sync(&adev->delayed_init_work); + + hive = amdgpu_get_xgmi_hive(adev, false); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +index dfa59c31e5e0..2cdaacef0226 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +@@ -1003,7 +1003,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) + int r, pasid; + + /* Ensure IB tests are run on ring */ +- flush_delayed_work(&adev->late_init_work); ++ flush_delayed_work(&adev->delayed_init_work); + + file_priv->driver_priv = NULL; + +-- +2.17.1 + |