diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-5.15/0042-drm-amdgpu-Register-MCE-notifier-for-Aldebaran-RAS.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-5.15/0042-drm-amdgpu-Register-MCE-notifier-for-Aldebaran-RAS.patch | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-5.15/0042-drm-amdgpu-Register-MCE-notifier-for-Aldebaran-RAS.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-5.15/0042-drm-amdgpu-Register-MCE-notifier-for-Aldebaran-RAS.patch
new file mode 100644
index 00000000..6a0c8ce0
--- /dev/null
+++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-5.15/0042-drm-amdgpu-Register-MCE-notifier-for-Aldebaran-RAS.patch
@@ -0,0 +1,194 @@
+From fe2e0beb9e86f615af44d33f84f684fc132d56d6 Mon Sep 17 00:00:00 2001
+From: Mukul Joshi <mukul.joshi@amd.com>
+Date: Wed, 22 Sep 2021 14:49:43 -0400
+Subject: [PATCH 42/86] drm/amdgpu: Register MCE notifier for Aldebaran RAS
+
+commit 12b2cab79017ebe598c74493ac1cfc5934d3ccc2 upstream
+
+On Aldebaran, GPU driver will handle bad page retirement
+for GPU memory even though UMC is host managed. As a result,
+register a bad page retirement handler on the mce notifier
+chain to retire bad pages on Aldebaran.
+
+Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
+Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Zhaolong Zhang <zhaolong.zhang@windriver.com>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 141 ++++++++++++++++++++++++
+ 1 file changed, 141 insertions(+)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+index 96a8fd0ca1df..6660327c7c50 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+@@ -35,7 +35,11 @@
+ #include "amdgpu_xgmi.h"
+ #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+ #include "atom.h"
++#ifdef CONFIG_X86_MCE_AMD
++#include <asm/mce.h>
+ 
++static bool notifier_registered;
++#endif
+ static const char *RAS_FS_NAME = "ras";
+ 
+ const char *ras_error_string[] = {
+@@ -85,6 +89,9 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+ 				uint64_t addr);
+ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+ 				uint64_t addr);
++#ifdef CONFIG_X86_MCE_AMD
++static void amdgpu_register_bad_pages_mca_notifier(void);
++#endif
+ 
+ void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
+ {
+@@ -2014,6 +2021,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+ 		adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
+ 	}
+ 
++#ifdef CONFIG_X86_MCE_AMD
++	if ((adev->asic_type == CHIP_ALDEBARAN) &&
++	    (adev->gmc.xgmi.connected_to_cpu))
++		amdgpu_register_bad_pages_mca_notifier();
++#endif
+ 	return 0;
+ 
+ free:
+@@ -2507,3 +2519,132 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)
+ 		kfree(con);
+ 	}
+ }
++
++#ifdef CONFIG_X86_MCE_AMD
++static struct amdgpu_device *find_adev(uint32_t node_id)
++{
++	struct amdgpu_gpu_instance *gpu_instance;
++	int i;
++	struct amdgpu_device *adev = NULL;
++
++	mutex_lock(&mgpu_info.mutex);
++
++	for (i = 0; i < mgpu_info.num_gpu; i++) {
++		gpu_instance = &(mgpu_info.gpu_ins[i]);
++		adev = gpu_instance->adev;
++
++		if (adev->gmc.xgmi.connected_to_cpu &&
++		    adev->gmc.xgmi.physical_node_id == node_id)
++			break;
++		adev = NULL;
++	}
++
++	mutex_unlock(&mgpu_info.mutex);
++
++	return adev;
++}
++
++#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
++#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
++#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
++#define GPU_ID_OFFSET		8
++
++static int amdgpu_bad_page_notifier(struct notifier_block *nb,
++				    unsigned long val, void *data)
++{
++	struct mce *m = (struct mce *)data;
++	struct amdgpu_device *adev = NULL;
++	uint32_t gpu_id = 0;
++	uint32_t umc_inst = 0;
++	uint32_t ch_inst, channel_index = 0;
++	struct ras_err_data err_data = {0, 0, 0, NULL};
++	struct eeprom_table_record err_rec;
++	uint64_t retired_page;
++
++	/*
++	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
++	 * and error occurred in DramECC (Extended error code = 0) then only
++	 * process the error, else bail out.
++	 */
++	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
++		    (XEC(m->status, 0x3f) == 0x0)))
++		return NOTIFY_DONE;
++
++	/*
++	 * If it is correctable error, return.
++	 */
++	if (mce_is_correctable(m))
++		return NOTIFY_OK;
++
++	/*
++	 * GPU Id is offset by GPU_ID_OFFSET in MCA_IPID_UMC register.
++	 */
++	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;
++
++	adev = find_adev(gpu_id);
++	if (!adev) {
++		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
++			 gpu_id);
++		return NOTIFY_DONE;
++	}
++
++	/*
++	 * If it is uncorrectable error, then find out UMC instance and
++	 * channel index.
++	 */
++	umc_inst = GET_UMC_INST(m->ipid);
++	ch_inst = GET_CHAN_INDEX(m->ipid);
++
++	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
++		 umc_inst, ch_inst);
++
++	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
++
++	/*
++	 * Translate UMC channel address to Physical address
++	 */
++	channel_index =
++		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
++					  + ch_inst];
++
++	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
++			ADDR_OF_256B_BLOCK(channel_index) |
++			OFFSET_IN_256B_BLOCK(m->addr);
++
++	err_rec.address = m->addr;
++	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
++	err_rec.ts = (uint64_t)ktime_get_real_seconds();
++	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
++	err_rec.cu = 0;
++	err_rec.mem_channel = channel_index;
++	err_rec.mcumc_id = umc_inst;
++
++	err_data.err_addr = &err_rec;
++	err_data.err_addr_cnt = 1;
++
++	if (amdgpu_bad_page_threshold != 0) {
++		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
++					 err_data.err_addr_cnt);
++		amdgpu_ras_save_bad_pages(adev);
++	}
++
++	return NOTIFY_OK;
++}
++
++static struct notifier_block amdgpu_bad_page_nb = {
++	.notifier_call = amdgpu_bad_page_notifier,
++	.priority = MCE_PRIO_UC,
++};
++
++static void amdgpu_register_bad_pages_mca_notifier(void)
++{
++	/*
++	 * Register the x86 notifier only once
++	 * with MCE subsystem.
++	 */
++	if (notifier_registered == false) {
++		mce_register_decode_chain(&amdgpu_bad_page_nb);
++		notifier_registered = true;
++	}
++}
++#endif
+-- 
+2.37.3
+