diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.14.71/5195-drm-amdgpu-cleanup-GPU-recovery-check-a-bit-v2.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.14.71/5195-drm-amdgpu-cleanup-GPU-recovery-check-a-bit-v2.patch | 193 |
1 file changed, 193 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.14.71/5195-drm-amdgpu-cleanup-GPU-recovery-check-a-bit-v2.patch b/common/recipes-kernel/linux/linux-yocto-4.14.71/5195-drm-amdgpu-cleanup-GPU-recovery-check-a-bit-v2.patch new file mode 100644 index 00000000..47bc9ef5 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.14.71/5195-drm-amdgpu-cleanup-GPU-recovery-check-a-bit-v2.patch @@ -0,0 +1,193 @@ +From 2f5d61b081ee4f06981c1596a19ee53bcbbab2e0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com> +Date: Tue, 21 Aug 2018 10:45:29 +0200 +Subject: [PATCH 5195/5725] drm/amdgpu: cleanup GPU recovery check a bit (v2) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Check if we should call the function instead of providing the forced +flag. + +v2: rebase on KFD changes (Alex) + +Signed-off-by: Christian König <christian.koenig@amd.com> +Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Reviewed-by: Huang Rui <ray.huang@amd.com> +Signed-off-by: Alex Deucher <alexander.deucher@amd.com> +Signed-off-by: Raveendra Talabattula <raveendra.talabattula@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++- + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 3 ++- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++---------- + drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 4 ++-- + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 3 ++- + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 4 ++-- + drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 3 ++- + 8 files changed, 38 insertions(+), 22 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index b11832a..91be1d4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -1286,8 +1286,9 @@ int emu_soc_asic_init(struct amdgpu_device *adev); + #define amdgpu_asic_need_full_reset(adev) 
(adev)->asic_funcs->need_full_reset((adev)) + + /* Common functions */ ++bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev); + int amdgpu_device_gpu_recover(struct amdgpu_device *adev, +- struct amdgpu_job* job, bool force); ++ struct amdgpu_job* job); + void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); + bool amdgpu_device_need_post(struct amdgpu_device *adev); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +index 0adee23..599cb6f 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +@@ -280,7 +280,8 @@ void amdgpu_amdkfd_gpu_reset(struct kgd_dev *kgd) + { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + +- amdgpu_device_gpu_recover(adev, NULL, false); ++ if (amdgpu_device_should_recover_gpu(adev)) ++ amdgpu_device_gpu_recover(adev, NULL); + } + + u32 pool_to_domain(enum kgd_memory_pool p) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 1acb85e..67adfb4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3304,31 +3304,43 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, + } + + /** ++ * amdgpu_device_should_recover_gpu - check if we should try GPU recovery ++ * ++ * @adev: amdgpu device pointer ++ * ++ * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover ++ * a hung GPU. 
++ */ ++bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) ++{ ++ if (!amdgpu_device_ip_check_soft_reset(adev)) { ++ DRM_INFO("Timeout, but no hardware hang detected.\n"); ++ return false; ++ } ++ ++ if (amdgpu_gpu_recovery == 0 || (amdgpu_gpu_recovery == -1 && ++ !amdgpu_sriov_vf(adev))) { ++ DRM_INFO("GPU recovery disabled.\n"); ++ return false; ++ } ++ ++ return true; ++} ++ ++/** + * amdgpu_device_gpu_recover - reset the asic and recover scheduler + * + * @adev: amdgpu device pointer + * @job: which job trigger hang +- * @force: forces reset regardless of amdgpu_gpu_recovery + * + * Attempt to reset the GPU if it has hung (all asics). + * Returns 0 for success or an error on failure. + */ + int amdgpu_device_gpu_recover(struct amdgpu_device *adev, +- struct amdgpu_job *job, bool force) ++ struct amdgpu_job *job) + { + int i, r, resched; + +- if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { +- DRM_INFO("No hardware hang detected. Did some blocks stall?\n"); +- return 0; +- } +- +- if (!force && (amdgpu_gpu_recovery == 0 || +- (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) { +- DRM_INFO("GPU recovery disabled.\n"); +- return 0; +- } +- + dev_info(adev->dev, "GPU reset begin!\n"); + + mutex_lock(&adev->lock_reset); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +index 1ec9590..e7f6389 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +@@ -702,7 +702,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) + struct amdgpu_device *adev = dev->dev_private; + + seq_printf(m, "gpu recover\n"); +- amdgpu_device_gpu_recover(adev, NULL, true); ++ amdgpu_device_gpu_recover(adev, NULL); + + return 0; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +index da8eda8..2d29753 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c ++++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +@@ -105,8 +105,8 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work) + struct amdgpu_device *adev = container_of(work, struct amdgpu_device, + reset_work); + +- if (!amdgpu_sriov_vf(adev)) +- amdgpu_device_gpu_recover(adev, NULL, false); ++ if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev)) ++ amdgpu_device_gpu_recover(adev, NULL); + } + + /** +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +index 1250aae..994b569 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +@@ -37,7 +37,8 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) + job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), + ring->fence_drv.sync_seq); + +- amdgpu_device_gpu_recover(ring->adev, job, false); ++ if (amdgpu_device_should_recover_gpu(ring->adev)) ++ amdgpu_device_gpu_recover(ring->adev, job); + } + + int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, +diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +index 078f70f..8cbb465 100644 +--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c ++++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +@@ -266,8 +266,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) + } + + /* Trigger recovery for world switch failure if no TDR */ +- if (amdgpu_lockup_timeout == 0) +- amdgpu_device_gpu_recover(adev, NULL, true); ++ if (amdgpu_device_should_recover_gpu(adev)) ++ amdgpu_device_gpu_recover(adev, NULL); + } + + static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, +diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +index 9fc1c37..842567b 100644 +--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c ++++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +@@ -521,7 +521,8 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) + } + + /* Trigger recovery due to world 
switch failure */ +- amdgpu_device_gpu_recover(adev, NULL, false); ++ if (amdgpu_device_should_recover_gpu(adev)) ++ amdgpu_device_gpu_recover(adev, NULL); + } + + static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev, +-- +2.7.4 + |