diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3624-drm-amdgpu-cleanups-for-vram-lost-handling.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3624-drm-amdgpu-cleanups-for-vram-lost-handling.patch | 270 |
1 files changed, 270 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3624-drm-amdgpu-cleanups-for-vram-lost-handling.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3624-drm-amdgpu-cleanups-for-vram-lost-handling.patch new file mode 100644 index 00000000..a0dcfbce --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3624-drm-amdgpu-cleanups-for-vram-lost-handling.patch @@ -0,0 +1,270 @@ +From 14b62f16474e181a70e786caee45908cd6a45fdf Mon Sep 17 00:00:00 2001 +From: Monk Liu <Monk.Liu@amd.com> +Date: Mon, 25 Dec 2017 11:59:27 +0800 +Subject: [PATCH 3624/4131] drm/amdgpu: cleanups for vram lost handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +1)create a routine "handle_vram_lost" to do the vram +recovery, and put it into amdgpu_device_reset/reset_sriov, +this way no need of the extra parameter to hold the +VRAM LOST information and the related macros can be removed. + +3)show vram_recover failure if time out, and set TMO equal to +lockup_timeout if vram_recover is under SRIOV runtime mode. 
+ +4)report error if any ip reset failed for SR-IOV + +Change-Id: I686e2b6133844c14948c206a2315c064a78c1d9c +Signed-off-by: Monk Liu <Monk.Liu@amd.com> +Acked-by: Christian König <christian.koenig@amd.com> +Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Signed-off-by: Kalyan Alle <kalyan.alle@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 - + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 138 +++++++++++++++-------------- + 2 files changed, 72 insertions(+), 70 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 37e43fd..d784389 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -183,10 +183,6 @@ extern int amdgpu_cik_support; + #define CIK_CURSOR_WIDTH 128 + #define CIK_CURSOR_HEIGHT 128 + +-/* GPU RESET flags */ +-#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0) +-#define AMDGPU_RESET_INFO_FULLRESET (1 << 1) +- + struct amdgpu_device; + struct amdgpu_ib; + struct amdgpu_cs_parser; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 5a83045..84adc73 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -1620,6 +1620,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) + + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); ++ if (r) ++ return r; + } + } + +@@ -1653,6 +1655,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev) + + r = block->version->funcs->hw_init(adev); + DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed"); ++ if (r) ++ return r; + } + } + +@@ -2502,17 +2506,71 @@ static int amdgpu_device_recover_vram_from_shadow(struct amdgpu_device *adev, + return r; + } + ++static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev) ++{ ++ struct amdgpu_ring *ring = 
adev->mman.buffer_funcs_ring; ++ struct amdgpu_bo *bo, *tmp; ++ struct dma_fence *fence = NULL, *next = NULL; ++ long r = 1; ++ int i = 0; ++ long tmo; ++ ++ if (amdgpu_sriov_runtime(adev)) ++ tmo = msecs_to_jiffies(amdgpu_lockup_timeout); ++ else ++ tmo = msecs_to_jiffies(100); ++ ++ DRM_INFO("recover vram bo from shadow start\n"); ++ mutex_lock(&adev->shadow_list_lock); ++ list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { ++ next = NULL; ++ amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); ++ if (fence) { ++ r = dma_fence_wait_timeout(fence, false, tmo); ++ if (r == 0) ++ pr_err("wait fence %p[%d] timeout\n", fence, i); ++ else if (r < 0) ++ pr_err("wait fence %p[%d] interrupted\n", fence, i); ++ if (r < 1) { ++ dma_fence_put(fence); ++ fence = next; ++ break; ++ } ++ i++; ++ } ++ ++ dma_fence_put(fence); ++ fence = next; ++ } ++ mutex_unlock(&adev->shadow_list_lock); ++ ++ if (fence) { ++ r = dma_fence_wait_timeout(fence, false, tmo); ++ if (r == 0) ++ pr_err("wait fence %p[%d] timeout\n", fence, i); ++ else if (r < 0) ++ pr_err("wait fence %p[%d] interrupted\n", fence, i); ++ ++ } ++ dma_fence_put(fence); ++ ++ if (r > 0) ++ DRM_INFO("recover vram bo from shadow done\n"); ++ else ++ DRM_ERROR("recover vram bo from shadow failed\n"); ++ ++ return (r > 0?0:1); ++} ++ + /* + * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough + * + * @adev: amdgpu device pointer +- * @reset_flags: output param tells caller the reset result + * + * attempt to do soft-reset or full-reset and reinitialize Asic + * return 0 means successed otherwise failed + */ +-static int amdgpu_device_reset(struct amdgpu_device *adev, +- uint64_t* reset_flags) ++static int amdgpu_device_reset(struct amdgpu_device *adev) + { + bool need_full_reset, vram_lost = 0; + int r; +@@ -2527,7 +2585,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, + DRM_INFO("soft reset failed, will fallback to full reset!\n"); + need_full_reset = true; + } +- + 
} + + if (need_full_reset) { +@@ -2576,13 +2633,8 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, + } + } + +- if (reset_flags) { +- if (vram_lost) +- (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; +- +- if (need_full_reset) +- (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; +- } ++ if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost)) ++ r = amdgpu_device_handle_vram_lost(adev); + + return r; + } +@@ -2591,14 +2643,11 @@ static int amdgpu_device_reset(struct amdgpu_device *adev, + * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf + * + * @adev: amdgpu device pointer +- * @reset_flags: output param tells caller the reset result + * + * do VF FLR and reinitialize Asic + * return 0 means successed otherwise failed + */ +-static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, +- uint64_t *reset_flags, +- bool from_hypervisor) ++static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor) + { + int r; + +@@ -2619,27 +2668,18 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, + + /* now we are okay to resume SMC/CP/SDMA */ + r = amdgpu_device_ip_reinit_late_sriov(adev); ++ amdgpu_virt_release_full_gpu(adev, true); + if (r) + goto error; + + amdgpu_irq_gpu_reset_resume_helper(adev); + r = amdgpu_ib_ring_tests(adev); +- if (r) +- dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r); ++ if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { ++ atomic_inc(&adev->vram_lost_counter); ++ r = amdgpu_device_handle_vram_lost(adev); ++ } + + error: +- /* release full control of GPU after ib test */ +- amdgpu_virt_release_full_gpu(adev, true); +- +- if (reset_flags) { +- if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { +- (*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST; +- atomic_inc(&adev->vram_lost_counter); +- } +- +- /* VF FLR or hotlink reset is always full-reset */ +- (*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET; +- } + + return r; + } +@@ 
-2658,7 +2698,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + struct amdgpu_job *job, bool force) + { + struct drm_atomic_state *state = NULL; +- uint64_t reset_flags = 0; + int i, r, resched; + + if (!force && !amdgpu_device_ip_check_soft_reset(adev)) { +@@ -2706,42 +2745,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, + } + + if (amdgpu_sriov_vf(adev)) +- r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true); +- else +- r = amdgpu_device_reset(adev, &reset_flags); +- +- if (!r) { +- if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) || +- (reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) { +- struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; +- struct amdgpu_bo *bo, *tmp; +- struct dma_fence *fence = NULL, *next = NULL; +- +- DRM_INFO("recover vram bo from shadow\n"); +- mutex_lock(&adev->shadow_list_lock); +- list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) { +- next = NULL; +- amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next); +- if (fence) { +- r = dma_fence_wait(fence, false); +- if (r) { +- WARN(r, "recovery from shadow isn't completed\n"); +- break; +- } +- } +- +- dma_fence_put(fence); +- fence = next; +- } +- mutex_unlock(&adev->shadow_list_lock); +- if (fence) { +- r = dma_fence_wait(fence, false); +- if (r) +- WARN(r, "recovery from shadow isn't completed\n"); +- } +- dma_fence_put(fence); +- } +- } ++ r = amdgpu_device_reset_sriov(adev, job ? false : true); ++ else ++ r = amdgpu_device_reset(adev); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; +-- +2.7.4 + |