diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3753-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3753-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch | 193 |
1 file changed, 193 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3753-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3753-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch new file mode 100644 index 00000000..52387a9f --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3753-drm-amdgpu-move-the-call-of-ras-recovery_init-and-ba.patch @@ -0,0 +1,193 @@ +From 78f24d941619c851ff2d6fdbe68a390484b58573 Mon Sep 17 00:00:00 2001 +From: Tao Zhou <tao.zhou1@amd.com> +Date: Fri, 30 Aug 2019 19:50:39 +0800 +Subject: [PATCH 3753/4256] drm/amdgpu: move the call of ras recovery_init and + bad page reserve to proper place + +ras recovery_init should be called after ttm init, +bad page reserve should be put in front of gpu reset since i2c +may be unstable during gpu reset. +add cleanup for recovery_init and recovery_fini + +v2: add more comment and print. + remove cancel_work_sync in recovery_init. + +Signed-off-by: Tao Zhou <tao.zhou1@amd.com> +Reviewed-by: Guchun Chen <guchun.chen@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 --- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++-------- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++ + drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++ + 4 files changed, 43 insertions(+), 18 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index b29b26098b8f..92e01084911c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -3619,11 +3619,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, + break; + } + } +- +- list_for_each_entry(tmp_adev, device_list_handle, +- gmc.xgmi.head) { +- amdgpu_ras_reserve_bad_pages(tmp_adev); +- } + } + } + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index e68f43d1cfea..d2437e13a085 100644 +--- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -1491,16 +1491,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) + return 0; + } + +-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ++int amdgpu_ras_recovery_init(struct amdgpu_device *adev) + { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data **data = &con->eh_data; + int ret; + +- *data = kmalloc(sizeof(**data), +- GFP_KERNEL|__GFP_ZERO); +- if (!*data) +- return -ENOMEM; ++ *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); ++ if (!*data) { ++ ret = -ENOMEM; ++ goto out; ++ } + + mutex_init(&con->recovery_lock); + INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); +@@ -1509,18 +1510,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) + + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); + if (ret) +- return ret; ++ goto free; + + if (adev->psp.ras.ras->eeprom_control.num_recs) { + ret = amdgpu_ras_load_bad_pages(adev); + if (ret) +- return ret; ++ goto free; + ret = amdgpu_ras_reserve_bad_pages(adev); + if (ret) +- return ret; ++ goto release; + } + + return 0; ++ ++release: ++ amdgpu_ras_release_bad_pages(adev); ++free: ++ con->eh_data = NULL; ++ kfree((*data)->bps); ++ kfree((*data)->bps_bo); ++ kfree(*data); ++out: ++ DRM_WARN("Failed to initialize ras recovery!\n"); ++ ++ return ret; + } + + static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) +@@ -1528,12 +1541,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data = con->eh_data; + ++ /* recovery_init failed to init it, fini is useless */ ++ if (!data) ++ return 0; ++ + cancel_work_sync(&con->recovery_work); + amdgpu_ras_release_bad_pages(adev); + + mutex_lock(&con->recovery_lock); + con->eh_data = NULL; + kfree(data->bps); ++ kfree(data->bps_bo); + kfree(data); + 
mutex_unlock(&con->recovery_lock); + +@@ -1625,9 +1643,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) + return r; + } + +- if (amdgpu_ras_recovery_init(adev)) +- goto recovery_out; +- + amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; + + if (amdgpu_ras_fs_init(adev)) +@@ -1642,8 +1657,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) + con->hw_supported, con->supported); + return 0; + fs_out: +- amdgpu_ras_recovery_fini(adev); +-recovery_out: + amdgpu_ras_set_context(adev, NULL); + kfree(con); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +index 96210e18191e..012034d2ae06 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, + return ras && (ras->supported & (1 << block)); + } + ++int amdgpu_ras_recovery_init(struct amdgpu_device *adev); + int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, + unsigned int block); + +@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, + { + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + ++ /* save bad page to eeprom before gpu reset, ++ * i2c may be unstable in gpu reset ++ */ ++ amdgpu_ras_reserve_bad_pages(adev); + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) + schedule_work(&ras->recovery_work); + return 0; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +index 7377bff42335..7c3025abd387 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +@@ -49,6 +49,7 @@ + #include "amdgpu_trace.h" + #include "amdgpu_amdkfd.h" + #include "amdgpu_sdma.h" ++#include "amdgpu_ras.h" + #include "bif/bif_4_1_d.h" + #include "amdgpu_amdkfd.h" + +@@ -2090,6 +2091,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) + adev->gmc.visible_vram_size); + #endif + ++ /* ++ * retired pages will be loaded from 
eeprom and reserved here, ++ * it should be called after ttm init since new bo may be created, ++ * recovery_init may fail, but it can free all resources allocated by ++ * itself and its failure should not stop amdgpu init process. ++ * ++ * Note: theoretically, this should be called before all vram allocations ++ * to protect retired page from abusing ++ */ ++ amdgpu_ras_recovery_init(adev); ++ + /* + *The reserved vram for firmware must be pinned to the specified + *place on the VRAM, so reserve it early. +-- +2.17.1 + |