diff options
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch')
-rw-r--r-- | common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch new file mode 100644 index 00000000..76fa3b70 --- /dev/null +++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch @@ -0,0 +1,103 @@ +From 1357825b6905bcf665161dc41b764a83b21954e9 Mon Sep 17 00:00:00 2001 +From: Tony Luck <tony.luck@intel.com> +Date: Fri, 22 Jun 2018 11:54:23 +0200 +Subject: [PATCH 09/10] x86/mce: Fix incorrect "Machine check from unknown + source" message + +commit 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 upstream. + +Some injection testing resulted in the following console log: + + mce: [Hardware Error]: CPU 22: Machine Check Exception: f Bank 1: bd80000000100134 + mce: [Hardware Error]: RIP 10:<ffffffffc05292dd> {pmem_do_bvec+0x11d/0x330 [nd_pmem]} + mce: [Hardware Error]: TSC c51a63035d52 ADDR 3234bc4000 MISC 88 + mce: [Hardware Error]: PROCESSOR 0:50654 TIME 1526502199 SOCKET 0 APIC 38 microcode 2000043 + mce: [Hardware Error]: Run the above through 'mcelog --ascii' + Kernel panic - not syncing: Machine check from unknown source + +This confused everybody because the first line quite clearly shows +that we found a logged error in "Bank 1", while the last line says +"unknown source". + +The problem is that the Linux code doesn't do the right thing +for a local machine check that results in a fatal error. + +It turns out that we know very early in the handler whether the +machine check is fatal. The call to mce_no_way_out() has checked +all the banks for the CPU that took the local machine check. If +it says we must crash, we can do so right away with the right +messages. + +We do scan all the banks again. This means that we might initially +not see a problem, but during the second scan find something fatal. 
+If this happens we print a slightly different message (so I can +see if it actually ever happens). + +[ bp: Remove unneeded severity assignment. ] + +Signed-off-by: Tony Luck <tony.luck@intel.com> +Signed-off-by: Borislav Petkov <bp@suse.de> +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Cc: Ashok Raj <ashok.raj@intel.com> +Cc: Dan Williams <dan.j.williams@intel.com> +Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com> +Cc: linux-edac <linux-edac@vger.kernel.org> +Cc: stable@vger.kernel.org # 4.2 +Link: http://lkml.kernel.org/r/52e049a497e86fd0b71c529651def8871c804df0.1527283897.git.tony.luck@intel.com +Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> +--- + arch/x86/kernel/cpu/mcheck/mce.c | 26 ++++++++++++++++++-------- + 1 file changed, 18 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c +index 72bcd08..4711e1c 100644 +--- a/arch/x86/kernel/cpu/mcheck/mce.c ++++ b/arch/x86/kernel/cpu/mcheck/mce.c +@@ -1169,13 +1169,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) + lmce = m.mcgstatus & MCG_STATUS_LMCES; + + /* ++ * Local machine check may already know that we have to panic. ++ * Broadcast machine check begins rendezvous in mce_start() + * Go through all banks in exclusion of the other CPUs. This way we + * don't report duplicated events on shared banks because the first one +- * to see it will clear it. If this is a Local MCE, then no need to +- * perform rendezvous. ++ * to see it will clear it. + */ +- if (!lmce) ++ if (lmce) { ++ if (no_way_out) ++ mce_panic("Fatal local machine check", &m, msg); ++ } else { + order = mce_start(&no_way_out); ++ } + + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); +@@ -1251,12 +1256,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) + no_way_out = worst >= MCE_PANIC_SEVERITY; + } else { + /* +- * Local MCE skipped calling mce_reign() +- * If we found a fatal error, we need to panic here. 
++ * If there was a fatal machine check we should have ++ * already called mce_panic earlier in this function. ++ * Since we re-read the banks, we might have found ++ * something new. Check again to see if we found a ++ * fatal error. We call "mce_severity()" again to ++ * make sure we have the right "msg". + */ +- if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) +- mce_panic("Machine check from unknown source", +- NULL, NULL); ++ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { ++ mce_severity(&m, cfg->tolerant, &msg, true); ++ mce_panic("Local fatal machine check!", &m, msg); ++ } + } + + /* +-- +2.7.4 + |