diff options
authorDaniel J Blueman <daniel@numascale.com>2015-02-17 11:34:38 +0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2015-03-06 14:43:31 -0800
commita36a251710af8e53fdf252f44dba5b0fdb478d30 (patch)
parent1e3240b06aa7f5ffd5064f8b536e02285e050462 (diff)
EDAC, amd64_edac: Prevent OOPS with >16 memory controllers
commit 0c510cc83bdbaac8406f4f7caef34f4da0ba35ea upstream. When DRAM errors occur on memory controllers after EDAC_MAX_MCS (16), the kernel fatally dereferences unallocated structures, see splat below; this occurs on at least NumaConnect systems. Fix by checking if a memory controller info structure was found. BUG: unable to handle kernel NULL pointer dereference at 0000000000000320 IP: [<ffffffff819f714f>] decode_bus_error+0x2f/0x2b0 PGD 2f8b5a3067 PUD 2f8b5a2067 PMD 0 Oops: 0000 [#2] SMP Modules linked in: CPU: 224 PID: 11930 Comm: stream_c.exe.gn Tainted: G D 3.19.0 #1 Hardware name: Supermicro H8QGL/H8QGL, BIOS 3.5b 01/28/2015 task: ffff8807dbfb8c00 ti: ffff8807dd16c000 task.ti: ffff8807dd16c000 RIP: 0010:[<ffffffff819f714f>] [<ffffffff819f714f>] decode_bus_error+0x2f/0x2b0 RSP: 0000:ffff8907dfc03c48 EFLAGS: 00010297 RAX: 0000000000000001 RBX: 9c67400010080a13 RCX: 0000000000001dc6 RDX: 000000001dc61dc6 RSI: ffff8907dfc03df0 RDI: 000000000000001c RBP: ffff8907dfc03ce8 R08: 0000000000000000 R09: 0000000000000022 R10: ffff891fffa30380 R11: 00000000001cfc90 R12: 0000000000000008 R13: 0000000000000000 R14: 000000000000001c R15: 00009c6740001000 FS: 00007fa97ee18700(0000) GS:ffff8907dfc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000320 CR3: 0000003f889b8000 CR4: 00000000000407e0 Stack: 0000000000000000 ffff8907dfc03df0 0000000000000008 9c67400010080a13 000000000000001c 00009c6740001000 ffff8907dfc03c88 ffffffff810e4f9a ffff8907dfc03ce8 ffffffff81b375b9 0000000000000000 0000000000000010 Call Trace: <IRQ> ? vprintk_default ? printk amd_decode_mce notifier_call_chain atomic_notifier_call_chain mce_log machine_check_poll mce_timer_fn ? mce_cpu_restart call_timer_fn.isra.29 run_timer_softirq __do_softirq irq_exit smp_apic_timer_interrupt apic_timer_interrupt <EOI> ? down_read_trylock __do_page_fault ? __schedule do_page_fault page_fault Signed-off-by: Daniel J Blueman <daniel@numascale.com> Link: http://lkml.kernel.org/r/1424144078-24589-1-git-send-email-daniel@numascale.com [ Boris: massage commit message ] Signed-off-by: Borislav Petkov <bp@suse.de> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 files changed, 8 insertions, 2 deletions
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 98e14ee4833c..278603c373ca 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2006,14 +2006,20 @@ static void __log_bus_error(struct mem_ctl_info *mci, struct err_info *err,
static inline void decode_bus_error(int node_id, struct mce *m)
- struct mem_ctl_info *mci = mcis[node_id];
- struct amd64_pvt *pvt = mci->pvt_info;
+ struct mem_ctl_info *mci;
+ struct amd64_pvt *pvt;
u8 ecc_type = (m->status >> 45) & 0x3;
u8 xec = XEC(m->status, 0x1f);
u16 ec = EC(m->status);
u64 sys_addr;
struct err_info err;
+ mci = edac_mc_find(node_id);
+ if (!mci)
+ return;
+ pvt = mci->pvt_info;
/* Bail out early if this was an 'observed' error */
if (PP(ec) == NBSL_PP_OBS)