aboutsummaryrefslogtreecommitdiffstats
path: root/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch
diff options
context:
space:
mode:
Diffstat (limited to 'common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch')
-rw-r--r--common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch103
1 files changed, 103 insertions, 0 deletions
diff --git a/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch
new file mode 100644
index 00000000..76fa3b70
--- /dev/null
+++ b/common/recipes-kernel/linux/linux-yocto-4.9.21/0009-x86-mce-Fix-incorrect-Machine-check-from-unknown-sou.patch
@@ -0,0 +1,103 @@
+From 1357825b6905bcf665161dc41b764a83b21954e9 Mon Sep 17 00:00:00 2001
+From: Tony Luck <tony.luck@intel.com>
+Date: Fri, 22 Jun 2018 11:54:23 +0200
+Subject: [PATCH 09/10] x86/mce: Fix incorrect "Machine check from unknown
+ source" message
+
+commit 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 upstream.
+
+Some injection testing resulted in the following console log:
+
+ mce: [Hardware Error]: CPU 22: Machine Check Exception: f Bank 1: bd80000000100134
+ mce: [Hardware Error]: RIP 10:<ffffffffc05292dd> {pmem_do_bvec+0x11d/0x330 [nd_pmem]}
+ mce: [Hardware Error]: TSC c51a63035d52 ADDR 3234bc4000 MISC 88
+ mce: [Hardware Error]: PROCESSOR 0:50654 TIME 1526502199 SOCKET 0 APIC 38 microcode 2000043
+ mce: [Hardware Error]: Run the above through 'mcelog --ascii'
+ Kernel panic - not syncing: Machine check from unknown source
+
+This confused everybody because the first line quite clearly shows
+that we found a logged error in "Bank 1", while the last line says
+"unknown source".
+
+The problem is that the Linux code doesn't do the right thing
+for a local machine check that results in a fatal error.
+
+It turns out that we know very early in the handler whether the
+machine check is fatal. The call to mce_no_way_out() has checked
+all the banks for the CPU that took the local machine check. If
+it says we must crash, we can do so right away with the right
+messages.
+
+We do scan all the banks again. This means that we might initially
+not see a problem, but during the second scan find something fatal.
+If this happens we print a slightly different message (so I can
+see if it actually every happens).
+
+[ bp: Remove unneeded severity assignment. ]
+
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ashok Raj <ashok.raj@intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
+Cc: linux-edac <linux-edac@vger.kernel.org>
+Cc: stable@vger.kernel.org # 4.2
+Link: http://lkml.kernel.org/r/52e049a497e86fd0b71c529651def8871c804df0.1527283897.git.tony.luck@intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/mcheck/mce.c | 26 ++++++++++++++++++--------
+ 1 file changed, 18 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
+index 72bcd08..4711e1c 100644
+--- a/arch/x86/kernel/cpu/mcheck/mce.c
++++ b/arch/x86/kernel/cpu/mcheck/mce.c
+@@ -1169,13 +1169,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
+ lmce = m.mcgstatus & MCG_STATUS_LMCES;
+
+ /*
++ * Local machine check may already know that we have to panic.
++ * Broadcast machine check begins rendezvous in mce_start()
+ * Go through all banks in exclusion of the other CPUs. This way we
+ * don't report duplicated events on shared banks because the first one
+- * to see it will clear it. If this is a Local MCE, then no need to
+- * perform rendezvous.
++ * to see it will clear it.
+ */
+- if (!lmce)
++ if (lmce) {
++ if (no_way_out)
++ mce_panic("Fatal local machine check", &m, msg);
++ } else {
+ order = mce_start(&no_way_out);
++ }
+
+ for (i = 0; i < cfg->banks; i++) {
+ __clear_bit(i, toclear);
+@@ -1251,12 +1256,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+- * Local MCE skipped calling mce_reign()
+- * If we found a fatal error, we need to panic here.
++ * If there was a fatal machine check we should have
++ * already called mce_panic earlier in this function.
++ * Since we re-read the banks, we might have found
++ * something new. Check again to see if we found a
++ * fatal error. We call "mce_severity()" again to
++ * make sure we have the right "msg".
+ */
+- if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+- mce_panic("Machine check from unknown source",
+- NULL, NULL);
++ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
++ mce_severity(&m, cfg->tolerant, &msg, true);
++ mce_panic("Local fatal machine check!", &m, msg);
++ }
+ }
+
+ /*
+--
+2.7.4
+