about summary refs log tree commit diff stats
path: root/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch
diff options
context:
space:
mode:
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch')
-rw-r--r-- meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch 163
1 files changed, 163 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch
new file mode 100644
index 00000000..eb253044
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.19.8/3752-drm-amdgpu-save-umc-error-records.patch
@@ -0,0 +1,163 @@
+From c251a1c24acb293667147ac950d5800e07abb490 Mon Sep 17 00:00:00 2001
+From: Tao Zhou <tao.zhou1@amd.com>
+Date: Thu, 15 Aug 2019 16:15:08 +0800
+Subject: [PATCH 3752/4256] drm/amdgpu: save umc error records
+
+save umc error records to ras bad page array
+
+v2: add bad pages before gpu reset
+v3: add NULL check for adev->umc.funcs
+
+Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
+Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
+Reviewed-by: Guchun Chen <guchun.chen@amd.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +-
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 40 +++++++++++++++++++------
+ drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 39 +++++++++++++++++++-----
+ 3 files changed, 64 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+index bc1d45971607..96210e18191e 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+@@ -347,7 +347,7 @@ struct ras_err_data {
+ unsigned long ue_count;
+ unsigned long ce_count;
+ unsigned long err_addr_cnt;
+- uint64_t *err_addr;
++ struct eeprom_table_record *err_addr;
+ };
+
+ struct ras_err_handler_data {
+diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+index d5c18deb407a..7a7068da02dd 100644
+--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+@@ -243,21 +243,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
+ struct ras_err_data *err_data,
+ struct amdgpu_iv_entry *entry)
+ {
+- if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+- if (adev->umc.funcs->query_ras_error_count)
+- adev->umc.funcs->query_ras_error_count(adev, err_data);
++ if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
++ return AMDGPU_RAS_SUCCESS;
++
++ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
++ if (adev->umc.funcs &&
++ adev->umc.funcs->query_ras_error_count)
++ adev->umc.funcs->query_ras_error_count(adev, err_data);
++
++ if (adev->umc.funcs &&
++ adev->umc.funcs->query_ras_error_address &&
++ adev->umc.max_ras_err_cnt_per_query) {
++ err_data->err_addr =
++ kcalloc(adev->umc.max_ras_err_cnt_per_query,
++ sizeof(struct eeprom_table_record), GFP_KERNEL);
++ /* still call query_ras_error_address to clear error status
++ * even NOMEM error is encountered
++ */
++ if(!err_data->err_addr)
++ DRM_WARN("Failed to alloc memory for umc error address record!\n");
++
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+- if (adev->umc.funcs->query_ras_error_address)
+- adev->umc.funcs->query_ras_error_address(adev, err_data);
++ adev->umc.funcs->query_ras_error_address(adev, err_data);
++ }
++
++ /* only uncorrectable error needs gpu reset */
++ if (err_data->ue_count) {
++ if (err_data->err_addr_cnt &&
++ amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
++ err_data->err_addr_cnt))
++ DRM_WARN("Failed to add ras bad page!\n");
+
+- /* only uncorrectable error needs gpu reset */
+- if (err_data->ue_count)
+- amdgpu_ras_reset_gpu(adev, 0);
++ amdgpu_ras_reset_gpu(adev, 0);
+ }
+
++ kfree(err_data->err_addr);
+ return AMDGPU_RAS_SUCCESS;
+ }
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+index 8502e736f721..09e316a22f1a 100644
+--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
++++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+@@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
+ RSMU_UMC_INDEX_MODE_EN, 0);
+ }
+
++static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
++{
++ uint32_t rsmu_umc_index;
++
++ rsmu_umc_index = RREG32_SOC15(RSMU, 0,
++ mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
++ return REG_GET_FIELD(rsmu_umc_index,
++ RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
++ RSMU_UMC_INDEX_INSTANCE);
++}
++
+ static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
+ uint32_t umc_reg_offset,
+ unsigned long *error_count)
+@@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
+ uint32_t umc_reg_offset, uint32_t channel_index)
+ {
+ uint32_t lsb, mc_umc_status_addr;
+- uint64_t mc_umc_status, err_addr;
++ uint64_t mc_umc_status, err_addr, retired_page;
++ struct eeprom_table_record *err_rec;
+
+ mc_umc_status_addr =
+ SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
+@@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
+ return;
+ }
+
++ err_rec = &err_data->err_addr[err_data->err_addr_cnt];
+ mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
+
+ /* calculate error address if ue/ce error is detected */
+@@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
+ err_addr &= ~((0x1ULL << lsb) - 1);
+
+ /* translate umc channel address to soc pa, 3 parts are included */
+- err_data->err_addr[err_data->err_addr_cnt] =
+- ADDR_OF_8KB_BLOCK(err_addr) |
+- ADDR_OF_256B_BLOCK(channel_index) |
+- OFFSET_IN_256B_BLOCK(err_addr);
+-
+- err_data->err_addr_cnt++;
++ retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
++ ADDR_OF_256B_BLOCK(channel_index) |
++ OFFSET_IN_256B_BLOCK(err_addr);
++
++ /* we only save ue error information currently, ce is skipped */
++ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
++ == 1) {
++ err_rec->address = err_addr;
++ /* page frame address is saved */
++ err_rec->retired_page = retired_page >> PAGE_SHIFT;
++ err_rec->ts = (uint64_t)ktime_get_real_seconds();
++ err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
++ err_rec->cu = 0;
++ err_rec->mem_channel = channel_index;
++ err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
++
++ err_data->err_addr_cnt++;
++ }
+ }
+
+ /* clear umc status */
+--
+2.17.1
+