diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3751-drm-amdgpu-Hook-EEPROM-table-to-RAS.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3751-drm-amdgpu-Hook-EEPROM-table-to-RAS.patch | 181 |
1 files changed, 181 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3751-drm-amdgpu-Hook-EEPROM-table-to-RAS.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3751-drm-amdgpu-Hook-EEPROM-table-to-RAS.patch new file mode 100644 index 00000000..907d2a11 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3751-drm-amdgpu-Hook-EEPROM-table-to-RAS.patch @@ -0,0 +1,181 @@ +From 22b36201f07bdc613ce880ae7ddd3aebe1eb10a3 Mon Sep 17 00:00:00 2001 +From: Tao Zhou <tao.zhou1@amd.com> +Date: Thu, 15 Aug 2019 14:55:55 +0800 +Subject: [PATCH 3751/4256] drm/amdgpu: Hook EEPROM table to RAS + +support eeprom records load and save for ras, +move EEPROM records storing to bad page reserving + +v2: remove redundant check for con->eh_data + +Signed-off-by: Tao Zhou <tao.zhou1@amd.com> +Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Reviewed-by: Guchun Chen <guchun.chen@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 109 ++++++++++++++++++------ + 1 file changed, 81 insertions(+), 28 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +index c6f4c01b98a8..e68f43d1cfea 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +@@ -1364,6 +1364,69 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, + return ret; + } + ++/* ++ * write error record array to eeprom, the function should be ++ * protected by recovery_lock ++ */ ++static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) ++{ ++ struct amdgpu_ras *con = amdgpu_ras_get_context(adev); ++ struct ras_err_handler_data *data; ++ struct amdgpu_ras_eeprom_control *control = ++ &adev->psp.ras.ras->eeprom_control; ++ int save_count; ++ ++ if (!con || !con->eh_data) ++ return 0; ++ ++ data = con->eh_data; ++ save_count = data->count - control->num_recs; ++ /* only new entries are saved */ ++ if (save_count > 0) ++ if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control, ++ &data->bps[control->num_recs], ++ true, ++ save_count)) { ++ DRM_ERROR("Failed to save EEPROM table data!"); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++/* ++ * read error record array in eeprom and reserve enough space for ++ * storing new bad pages ++ */ ++static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) ++{ ++ struct amdgpu_ras_eeprom_control *control = ++ &adev->psp.ras.ras->eeprom_control; ++ struct eeprom_table_record *bps = NULL; ++ int ret = 0; ++ ++ /* no bad page record, skip eeprom access */ ++ if (!control->num_recs) ++ return ret; ++ ++ bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); ++ if (!bps) ++ return -ENOMEM; ++ ++ if (amdgpu_ras_eeprom_process_recods(control, bps, false, ++ control->num_recs)) { ++ DRM_ERROR("Failed to load EEPROM table records!"); ++ ret = -EIO; ++ goto out; ++ } ++ ++ ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); ++ ++out: ++ kfree(bps); ++ return ret; ++} ++ + /* called in gpu recovery/init */ + int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) + { +@@ -1371,7 +1434,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) + struct ras_err_handler_data *data; + uint64_t bp; + struct amdgpu_bo *bo; +- int i; ++ int i, ret = 0; + + if (!con || !con->eh_data) + return 0; +@@ -1391,9 +1454,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) + data->bps_bo[i] = bo; + data->last_reserved = i + 1; + } ++ ++ /* continue to save bad pages to eeprom even reesrve_vram fails */ ++ ret = amdgpu_ras_save_bad_pages(adev); + out: + mutex_unlock(&con->recovery_lock); +- return 0; ++ return ret; + } + + /* called when driver unload */ +@@ -1425,33 +1491,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) + return 0; + } + +-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) +-{ +- /* TODO +- * write the array to eeprom when SMU disabled. +- */ +- return 0; +-} +- +-/* +- * read error record array in eeprom and reserve enough space for +- * storing new bad pages +- */ +-static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) +-{ +- struct eeprom_table_record *bps = NULL; +- int ret; +- +- ret = amdgpu_ras_add_bad_pages(adev, bps, +- adev->umc.max_ras_err_cnt_per_query); +- +- return ret; +-} +- + static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) + { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data **data = &con->eh_data; ++ int ret; + + *data = kmalloc(sizeof(**data), + GFP_KERNEL|__GFP_ZERO); +@@ -1463,8 +1507,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) + atomic_set(&con->in_recovery, 0); + con->adev = adev; + +- amdgpu_ras_load_bad_pages(adev); +- amdgpu_ras_reserve_bad_pages(adev); ++ ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); ++ if (ret) ++ return ret; ++ ++ if (adev->psp.ras.ras->eeprom_control.num_recs) { ++ ret = amdgpu_ras_load_bad_pages(adev); ++ if (ret) ++ return ret; ++ ret = amdgpu_ras_reserve_bad_pages(adev); ++ if (ret) ++ return ret; ++ } + + return 0; + } +@@ -1475,7 +1529,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) + struct ras_err_handler_data *data = con->eh_data; + + cancel_work_sync(&con->recovery_work); +- amdgpu_ras_save_bad_pages(adev); + amdgpu_ras_release_bad_pages(adev); + + mutex_lock(&con->recovery_lock); +-- +2.17.1 + |