diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch | 163 |
1 file changed, 163 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch new file mode 100644 index 00000000..eb253044 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch @@ -0,0 +1,163 @@ +From c251a1c24acb293667147ac950d5800e07abb490 Mon Sep 17 00:00:00 2001 +From: Tao Zhou <tao.zhou1@amd.com> +Date: Thu, 15 Aug 2019 16:15:08 +0800 +Subject: [PATCH 3752/4256] drm/amdgpu: save umc error records + +save umc error records to ras bad page array + +v2: add bad pages before gpu reset +v3: add NULL check for adev->umc.funcs + +Signed-off-by: Tao Zhou <tao.zhou1@amd.com> +Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> +Reviewed-by: Guchun Chen <guchun.chen@amd.com> +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- + drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 40 +++++++++++++++++++------ + drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 39 +++++++++++++++++++----- + 3 files changed, 64 insertions(+), 17 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +index bc1d45971607..96210e18191e 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +@@ -347,7 +347,7 @@ struct ras_err_data { + unsigned long ue_count; + unsigned long ce_count; + unsigned long err_addr_cnt; +- uint64_t *err_addr; ++ struct eeprom_table_record *err_addr; + }; + + struct ras_err_handler_data { +diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +index d5c18deb407a..7a7068da02dd 100644 +--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +@@ -243,21 +243,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, + struct ras_err_data *err_data, + struct amdgpu_iv_entry *entry) + { +- if 
(!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { +- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); +- if (adev->umc.funcs->query_ras_error_count) +- adev->umc.funcs->query_ras_error_count(adev, err_data); ++ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) ++ return AMDGPU_RAS_SUCCESS; ++ ++ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); ++ if (adev->umc.funcs && ++ adev->umc.funcs->query_ras_error_count) ++ adev->umc.funcs->query_ras_error_count(adev, err_data); ++ ++ if (adev->umc.funcs && ++ adev->umc.funcs->query_ras_error_address && ++ adev->umc.max_ras_err_cnt_per_query) { ++ err_data->err_addr = ++ kcalloc(adev->umc.max_ras_err_cnt_per_query, ++ sizeof(struct eeprom_table_record), GFP_KERNEL); ++ /* still call query_ras_error_address to clear error status ++ * even NOMEM error is encountered ++ */ ++ if(!err_data->err_addr) ++ DRM_WARN("Failed to alloc memory for umc error address record!\n"); ++ + /* umc query_ras_error_address is also responsible for clearing + * error status + */ +- if (adev->umc.funcs->query_ras_error_address) +- adev->umc.funcs->query_ras_error_address(adev, err_data); ++ adev->umc.funcs->query_ras_error_address(adev, err_data); ++ } ++ ++ /* only uncorrectable error needs gpu reset */ ++ if (err_data->ue_count) { ++ if (err_data->err_addr_cnt && ++ amdgpu_ras_add_bad_pages(adev, err_data->err_addr, ++ err_data->err_addr_cnt)) ++ DRM_WARN("Failed to add ras bad page!\n"); + +- /* only uncorrectable error needs gpu reset */ +- if (err_data->ue_count) +- amdgpu_ras_reset_gpu(adev, 0); ++ amdgpu_ras_reset_gpu(adev, 0); + } + ++ kfree(err_data->err_addr); + return AMDGPU_RAS_SUCCESS; + } + +diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +index 8502e736f721..09e316a22f1a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c ++++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) + RSMU_UMC_INDEX_MODE_EN, 0); + } 
+ ++static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev) ++{ ++ uint32_t rsmu_umc_index; ++ ++ rsmu_umc_index = RREG32_SOC15(RSMU, 0, ++ mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); ++ return REG_GET_FIELD(rsmu_umc_index, ++ RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, ++ RSMU_UMC_INDEX_INSTANCE); ++} ++ + static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev, + uint32_t umc_reg_offset, + unsigned long *error_count) +@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, + uint32_t umc_reg_offset, uint32_t channel_index) + { + uint32_t lsb, mc_umc_status_addr; +- uint64_t mc_umc_status, err_addr; ++ uint64_t mc_umc_status, err_addr, retired_page; ++ struct eeprom_table_record *err_rec; + + mc_umc_status_addr = + SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); +@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, + return; + } + ++ err_rec = &err_data->err_addr[err_data->err_addr_cnt]; + mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); + + /* calculate error address if ue/ce error is detected */ +@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, + err_addr &= ~((0x1ULL << lsb) - 1); + + /* translate umc channel address to soc pa, 3 parts are included */ +- err_data->err_addr[err_data->err_addr_cnt] = +- ADDR_OF_8KB_BLOCK(err_addr) | +- ADDR_OF_256B_BLOCK(channel_index) | +- OFFSET_IN_256B_BLOCK(err_addr); +- +- err_data->err_addr_cnt++; ++ retired_page = ADDR_OF_8KB_BLOCK(err_addr) | ++ ADDR_OF_256B_BLOCK(channel_index) | ++ OFFSET_IN_256B_BLOCK(err_addr); ++ ++ /* we only save ue error information currently, ce is skipped */ ++ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) ++ == 1) { ++ err_rec->address = err_addr; ++ /* page frame address is saved */ ++ err_rec->retired_page = retired_page >> PAGE_SHIFT; ++ err_rec->ts = (uint64_t)ktime_get_real_seconds(); ++ err_rec->err_type = 
AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; ++ err_rec->cu = 0; ++ err_rec->mem_channel = channel_index; ++ err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); ++ ++ err_data->err_addr_cnt++; ++ } + } + + /* clear umc status */ +-- +2.17.1 + |