aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/alternative.c44
-rw-r--r--arch/x86/kernel/apic/io_apic.c15
-rw-r--r--arch/x86/kernel/apic/msi.c18
-rw-r--r--arch/x86/kernel/apic/vector.c27
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c122
-rw-r--r--arch/x86/kernel/cpu/bugs.c19
-rw-r--r--arch/x86/kernel/cpu/centaur.c1
-rw-r--r--arch/x86/kernel/cpu/common.c46
-rw-r--r--arch/x86/kernel/cpu/cpu.h4
-rw-r--r--arch/x86/kernel/cpu/intel.c13
-rw-r--r--arch/x86/kernel/cpu/mce/core.c15
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c2
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c8
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c1
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c1
-rw-r--r--arch/x86/kernel/dumpstack.c50
-rw-r--r--arch/x86/kernel/fpu/core.c45
-rw-r--r--arch/x86/kernel/fpu/xstate.c91
-rw-r--r--arch/x86/kernel/ftrace.c10
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c9
-rw-r--r--arch/x86/kernel/kgdb.c6
-rw-r--r--arch/x86/kernel/kprobes/core.c36
-rw-r--r--arch/x86/kernel/kprobes/opt.c40
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/ldt.c26
-rw-r--r--arch/x86/kernel/msr.c69
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/probe_roms.c24
-rw-r--r--arch/x86/kernel/process.c32
-rw-r--r--arch/x86/kernel/process_32.c29
-rw-r--r--arch/x86/kernel/process_64.c190
-rw-r--r--arch/x86/kernel/ptrace.c60
-rw-r--r--arch/x86/kernel/quirks.c10
-rw-r--r--arch/x86/kernel/signal.c3
-rw-r--r--arch/x86/kernel/smpboot.c64
-rw-r--r--arch/x86/kernel/stacktrace.c5
-rw-r--r--arch/x86/kernel/sys_ia32.c3
-rw-r--r--arch/x86/kernel/traps.c136
-rw-r--r--arch/x86/kernel/unwind_frame.c4
-rw-r--r--arch/x86/kernel/unwind_orc.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
48 files changed, 848 insertions, 474 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8fd39ff74a49..c826cddae157 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
@@ -15,6 +16,7 @@
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/bsearch.h>
+#include <linux/sync_core.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -53,7 +55,7 @@ __setup("noreplace-smp", setup_noreplace_smp);
#define DPRINTK(fmt, args...) \
do { \
if (debug_alternative) \
- printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
} while (0)
#define DUMP_BYTES(buf, len, fmt, args...) \
@@ -64,7 +66,7 @@ do { \
if (!(len)) \
break; \
\
- printk(KERN_DEBUG fmt, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
@@ -1001,6 +1003,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 old;
};
struct bp_patching_desc {
@@ -1044,7 +1047,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt)
return 0;
}
-int noinstr poke_int3_handler(struct pt_regs *regs)
+noinstr int poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
@@ -1168,8 +1171,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
+ tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ }
text_poke_sync();
@@ -1177,14 +1182,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
int len = text_opcode_size(tp[i].opcode);
if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
(const char *)tp[i].text + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
do_sync++;
}
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
+ tp[i].text, len);
}
if (do_sync) {
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ce61e3e7d399..21325a4a78b9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2316,12 +2316,12 @@ static int mp_irqdomain_create(int ioapic)
ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
(void *)(long)ioapic);
- /* Release fw handle if it was allocated above */
- if (!cfg->dev)
- irq_domain_free_fwnode(fn);
-
- if (!ip->irqdomain)
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
return -ENOMEM;
+ }
ip->irqdomain->parent = parent;
@@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic)
static void ioapic_destroy_irqdomain(int idx)
{
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
if (ioapics[idx].irqdomain) {
irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 5cbaca58af95..c2b2911feeef 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -263,12 +263,13 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
msi_default_domain =
pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
parent);
- irq_domain_free_fwnode(fn);
}
- if (!msi_default_domain)
+ if (!msi_default_domain) {
+ irq_domain_free_fwnode(fn);
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
- else
+ } else {
msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
+ }
}
#ifdef CONFIG_IRQ_REMAP
@@ -301,7 +302,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
if (!fn)
return NULL;
d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d)
+ irq_domain_free_fwnode(fn);
return d;
}
#endif
@@ -364,7 +366,8 @@ static struct irq_domain *dmar_get_irq_domain(void)
if (fn) {
dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
- irq_domain_free_fwnode(fn);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
}
out:
mutex_unlock(&dmar_lock);
@@ -489,7 +492,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
}
d = msi_create_irq_domain(fn, domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
return d;
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index c48be6e1f676..dae32d948bf2 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -446,12 +446,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
trace_vector_activate(irqd->irq, apicd->is_managed,
apicd->can_reserve, reserve);
- /* Nothing to do for fixed assigned vectors */
- if (!apicd->can_reserve && !apicd->is_managed)
- return 0;
-
raw_spin_lock_irqsave(&vector_lock, flags);
- if (reserve || irqd_is_managed_and_shutdown(irqd))
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed)
ret = activate_managed(irqd);
@@ -562,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
* as that can corrupt the affinity move state.
*/
irqd_set_handle_enforce_irqctx(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
/*
* Legacy vectors are already assigned when the IOAPIC
* takes them over. They stay on the same vector. This is
@@ -709,7 +711,6 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_domain_free_fwnode(fn);
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
@@ -775,20 +776,10 @@ void lapic_offline(void)
static int apic_set_affinity(struct irq_data *irqd,
const struct cpumask *dest, bool force)
{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
int err;
- /*
- * Core code can call here for inactive interrupts. For inactive
- * interrupts which use managed or reservation mode there is no
- * point in going through the vector assignment right now as the
- * activation will assign a vector which fits the destination
- * cpumask. Let the core code store the destination mask and be
- * done with it.
- */
- if (!irqd_is_activated(irqd) &&
- (apicd->is_managed || apicd->can_reserve))
- return IRQ_SET_MASK_OK;
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
raw_spin_lock(&vector_lock);
cpumask_and(vector_searchmask, dest, cpu_online_mask);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 69e70ed0f5e6..0b6eea3f54e6 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -24,8 +24,6 @@
#include <asm/uv/uv.h>
#include <asm/apic.h>
-static DEFINE_PER_CPU(int, x2apic_extra_bits);
-
static enum uv_system_type uv_system_type;
static int uv_hubbed_system;
static int uv_hubless_system;
@@ -40,7 +38,7 @@ static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
static struct {
unsigned int apicid_shift;
unsigned int apicid_mask;
- unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */
+ unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */
unsigned int pnode_mask;
unsigned int gpa_shift;
unsigned int gnode_shift;
@@ -48,8 +46,6 @@ static struct {
static int uv_min_hub_revision_id;
-unsigned int uv_apicid_hibits;
-
static struct apic apic_x2apic_uv_x;
static struct uv_hub_info_s uv_hub_info_node0;
@@ -139,12 +135,8 @@ static void __init uv_tsc_check_sync(void)
/* Accommodate different UV arch BIOSes */
mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR);
mmr_shift =
- is_uv1_hub() ? 0 :
is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT;
- if (mmr_shift)
- sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
- else
- sync_state = 0;
+ sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK;
switch (sync_state) {
case UVH_TSC_SYNC_VALID:
@@ -223,21 +215,6 @@ static void __init early_get_apic_socketid_shift(void)
pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
}
-/*
- * Add an extra bit as dictated by bios to the destination apicid of
- * interrupts potentially passing through the UV HUB. This prevents
- * a deadlock between interrupts and IO port operations.
- */
-static void __init uv_set_apicid_hibit(void)
-{
- union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
-
- if (is_uv1_hub()) {
- apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
- uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
- }
-}
-
static void __init uv_stringify(int len, char *to, char *from)
{
/* Relies on 'to' being NULL chars so result will be NULL terminated */
@@ -280,36 +257,25 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
/*
* Determine UV arch type.
- * SGI: UV100/1000
* SGI2: UV2000/3000
* SGI3: UV300 (truncated to 4 chars because of different varieties)
* SGI4: UV400 (truncated to 4 chars because of different varieties)
*/
- uv_hub_info->hub_revision =
- !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE :
- !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
- !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
- !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
-
- if (uv_hub_info->hub_revision == 0)
- goto badbios;
-
- switch (uv_hub_info->hub_revision) {
- case UV4_HUB_REVISION_BASE:
+ if (!strncmp(oem_id, "SGI4", 4)) {
+ uv_hub_info->hub_revision = UV4_HUB_REVISION_BASE;
uv_hubbed_system = 0x11;
- break;
- case UV3_HUB_REVISION_BASE:
+ } else if (!strncmp(oem_id, "SGI3", 4)) {
+ uv_hub_info->hub_revision = UV3_HUB_REVISION_BASE;
uv_hubbed_system = 0x9;
- break;
- case UV2_HUB_REVISION_BASE:
+ } else if (!strcmp(oem_id, "SGI2")) {
+ uv_hub_info->hub_revision = UV2_HUB_REVISION_BASE;
uv_hubbed_system = 0x5;
- break;
- case UV1_HUB_REVISION_BASE:
- uv_hubbed_system = 0x3;
- break;
+ } else {
+ uv_hub_info->hub_revision = 0;
+ goto badbios;
}
pnodeid = early_get_pnodeid();
@@ -323,14 +289,6 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
uv_system_type = UV_X2APIC;
uv_apic = 0;
- } else if (!strcmp(oem_table_id, "UVH")) {
- /* Only UV1 systems: */
- uv_system_type = UV_NON_UNIQUE_APIC;
- x86_platform.legacy.warm_reset = 0;
- __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
- uv_set_apicid_hibit();
- uv_apic = 1;
-
} else if (!strcmp(oem_table_id, "UVL")) {
/* Only used for very small systems: */
uv_system_type = UV_LEGACY_APIC;
@@ -347,7 +305,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
badbios:
pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
- pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+ pr_err("Current UV Type or BIOS not supported\n");
BUG();
}
@@ -545,7 +503,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
- phys_apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
@@ -576,7 +533,7 @@ static void uv_send_IPI_one(int cpu, int vector)
dmode = dest_Fixed;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
- ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) |
+ (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
(vector << UVH_IPI_INT_VECTOR_SHFT);
@@ -634,22 +591,16 @@ static void uv_init_apic_ldr(void)
static u32 apic_uv_calc_apicid(unsigned int cpu)
{
- return apic_default_calc_apicid(cpu) | uv_apicid_hibits;
+ return apic_default_calc_apicid(cpu);
}
-static unsigned int x2apic_get_apic_id(unsigned long x)
+static unsigned int x2apic_get_apic_id(unsigned long id)
{
- unsigned int id;
-
- WARN_ON(preemptible() && num_online_cpus() > 1);
- id = x | __this_cpu_read(x2apic_extra_bits);
-
return id;
}
static u32 set_apic_id(unsigned int id)
{
- /* CHECKME: Do we need to mask out the xapic extra bits? */
return id;
}
@@ -721,11 +672,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
-static void set_x2apic_extra_bits(int pnode)
-{
- __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
-}
-
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3
#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
@@ -920,15 +866,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)
return;
}
- if (is_uv1_hub()) {
- mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
- shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- mmioh.v = uv_read_local_mmr(mmr);
- enable = !!mmioh.s1.enable;
- base = mmioh.s1.base;
- m_io = mmioh.s1.m_io;
- n_io = mmioh.s1.n_io;
- } else if (is_uv2_hub()) {
+ if (is_uv2_hub()) {
mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
mmioh.v = uv_read_local_mmr(mmr);
@@ -936,16 +874,15 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)
base = mmioh.s2.base;
m_io = mmioh.s2.m_io;
n_io = mmioh.s2.n_io;
- } else {
- return;
- }
- if (enable) {
- max_pnode &= (1 << n_io) - 1;
- pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode);
- map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
- } else {
- pr_info("UV: MMIOH disabled\n");
+ if (enable) {
+ max_pnode &= (1 << n_io) - 1;
+ pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
+ base, shift, m_io, n_io, max_pnode);
+ map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
+ } else {
+ pr_info("UV: MMIOH disabled\n");
+ }
}
}
@@ -1081,9 +1018,6 @@ void uv_cpu_init(void)
return;
uv_hub_info->nr_online_cpus++;
-
- if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->pnode);
}
struct mn {
@@ -1114,9 +1048,6 @@ static void get_mn(struct mn *mnp)
} else if (is_uv2_hub()) {
mnp->m_val = m_n_config.s2.m_skt;
mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
- } else if (is_uv1_hub()) {
- mnp->m_val = m_n_config.s1.m_skt;
- mnp->n_lshift = mnp->m_val;
}
mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
}
@@ -1318,7 +1249,7 @@ static void __init build_socket_tables(void)
size_t bytes;
if (!gre) {
- if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
+ if (is_uv2_hub() || is_uv3_hub()) {
pr_info("UV: No UVsystab socket table, ignoring\n");
return;
}
@@ -1500,8 +1431,7 @@ static void __init uv_system_init_hub(void)
unsigned short min_pnode = 9999, max_pnode = 0;
char *hub = is_uv4_hub() ? "UV400" :
is_uv3_hub() ? "UV300" :
- is_uv2_hub() ? "UV2000/3000" :
- is_uv1_hub() ? "UV100/1000" : NULL;
+ is_uv2_hub() ? "UV2000/3000" : NULL;
if (!hub) {
pr_err("UV: Unknown/unsupported UV hub\n");
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0b71970d2d3d..f0b743a2fe9c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void)
* If FSGSBASE is enabled, the user can put a kernel address in
* GS, in which case SMAP provides no protection.
*
- * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
- * FSGSBASE enablement patches have been merged. ]
- *
* If FSGSBASE is disabled, the user can only put a user space
* address in GS. That makes an attack harder, but still
* possible if there's no SMAP protection.
*/
- if (!smap_works_speculatively()) {
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
/*
* Mitigation can be provided from SWAPGS itself or
* PTI as the CR3 write in the Meltdown mitigation
@@ -763,10 +761,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
}
/*
- * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
* required.
*/
- if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
@@ -778,12 +778,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- /*
- * If STIBP is not available, clear the STIBP mode.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- mode = SPECTRE_V2_USER_NONE;
-
spectre_v2_user_stibp = mode;
set_mode:
@@ -1270,7 +1264,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
* Indirect branch speculation is always disabled in strict
* mode. It can neither be enabled if it was force-disabled
* by a previous prctl call.
-
*/
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 426792565d86..c5cf336e5077 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 043d93cdcaad..965474d78cef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -347,6 +347,9 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask =
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -371,20 +374,20 @@ EXPORT_SYMBOL(native_write_cr0);
void native_write_cr4(unsigned long val)
{
- unsigned long bits_missing = 0;
+ unsigned long bits_changed = 0;
set_register:
asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits));
if (static_branch_likely(&cr_pinning)) {
- if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) {
- bits_missing = ~val & cr4_pinned_bits;
- val |= bits_missing;
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
goto set_register;
}
- /* Warn after we've set the missing bits. */
- WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n",
- bits_missing);
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
}
}
#if IS_MODULE(CONFIG_LKDTM)
@@ -419,7 +422,7 @@ void cr4_init(void)
if (boot_cpu_has(X86_FEATURE_PCID))
cr4 |= X86_CR4_PCIDE;
if (static_branch_likely(&cr_pinning))
- cr4 |= cr4_pinned_bits;
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
__write_cr4(cr4);
@@ -434,13 +437,26 @@ void cr4_init(void)
*/
static void __init setup_cr_pinning(void)
{
- unsigned long mask;
-
- mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP);
- cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask;
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
static_key_enable(&cr_pinning.key);
}
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
+ return 1;
+}
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1495,6 +1511,12 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smap(c);
setup_umip(c);
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index fb538fccd24c..9d033693519a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -81,8 +81,4 @@ extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
-#ifdef CONFIG_IA32_FEAT_CTL
-void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
-#endif
-
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index c25a67a34bd3..b6b7b38dff5f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -50,6 +50,13 @@ static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
static u64 msr_test_ctrl_cache __ro_after_init;
/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
@@ -1071,7 +1078,8 @@ static void sld_update_msr(bool on)
static void split_lock_init(void)
{
- split_lock_verify_msr(sld_state != sld_off);
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
}
static void split_lock_warn(unsigned long ip)
@@ -1148,6 +1156,8 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
{}
};
@@ -1177,5 +1187,6 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
return;
}
+ cpu_model_supports_sld = true;
split_lock_setup();
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c4f740..f43a78bde670 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -42,6 +42,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>
+#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
@@ -244,6 +245,8 @@ static void __print_mce(struct mce *m)
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (m->ppin)
+ pr_cont("PPIN %llx ", m->ppin);
if (mce_flags.smca) {
if (m->synd)
@@ -1083,7 +1086,7 @@ static noinstr bool mce_check_crashing_cpu(void)
{
unsigned int cpu = smp_processor_id();
- if (cpu_is_offline(cpu) ||
+ if (arch_cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
@@ -1212,7 +1215,7 @@ static void kill_me_maybe(struct callback_head *cb)
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
-void noinstr do_machine_check(struct pt_regs *regs)
+noinstr void do_machine_check(struct pt_regs *regs)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
@@ -1901,6 +1904,8 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
+ WARN_ON_ONCE(user_mode(regs));
+
/*
* Only required when from kernel mode. See
* mce_check_crashing_cpu() for details.
@@ -1925,11 +1930,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
- idtentry_enter_user(regs);
+ irqentry_enter_from_user_mode(regs);
instrumentation_begin();
machine_check_vector(regs);
instrumentation_end();
- idtentry_exit_user(regs);
+ irqentry_exit_to_user_mode(regs);
}
#ifdef CONFIG_X86_64
@@ -1954,7 +1959,7 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
}
#else
/* 32bit unified entry point */
-DEFINE_IDTENTRY_MCE(exc_machine_check)
+DEFINE_IDTENTRY_RAW(exc_machine_check)
{
unsigned long dr7;
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 43c466020ed5..03e51053592a 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -345,7 +345,7 @@ static __init int dev_mcelog_init_device(void)
int err;
mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
- mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL);
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
if (!mcelog)
return -ENOMEM;
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 0593b192eb8f..7843ab3fde09 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -511,7 +511,7 @@ static void do_inject(void)
*/
if (inj_type == DFR_INT_INJ) {
i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ i_mce.status &= ~MCI_STATUS_UC;
}
/*
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index baec68b7e010..ec6f0415bc6d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -145,7 +145,6 @@ extern struct builtin_fw __end_builtin_fw[];
bool get_builtin_firmware(struct cpio_data *cd, const char *name)
{
-#ifdef CONFIG_FW_LOADER
struct builtin_fw *b_fw;
for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
@@ -155,7 +154,6 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
return true;
}
}
-#endif
return false;
}
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 12f967c6b603..6a9df71c1b9e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -981,10 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
- if (c->x86_vendor == X86_VENDOR_INTEL)
- c->x86_cache_mbm_width_offset = eax & 0xff;
- else
- c->x86_cache_mbm_width_offset = -1;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index f20a47d120b1..5ffa32256b3b 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -37,6 +37,7 @@
#define MBA_IS_LINEAR 0x4
#define MBA_MAX_MBPS U32_MAX
#define MAX_MBA_BW_AMD 0x800
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 23b4b61319d3..3f844f14fc0a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1117,6 +1117,7 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
_d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
_r_cdp = NULL;
+ _d_cdp = NULL;
ret = -EINVAL;
}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index 300e3fd5ade3..ec8064c0ae03 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -18,12 +18,6 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
-u32 get_umwait_control_msr(void)
-{
- return umwait_control_cached;
-}
-EXPORT_SYMBOL_GPL(get_umwait_control_msr);
-
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index df1358ba622b..05fa4ef63490 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 456511b2284e..48ce44576947 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -71,6 +71,22 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
+static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
+ unsigned int nbytes)
+{
+ if (!user_mode(regs))
+ return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);
+
+ /*
+ * Make sure userspace isn't trying to trick us into dumping kernel
+ * memory by pointing the userspace instruction pointer at it.
+ */
+ if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
+ return -EINVAL;
+
+ return copy_from_user_nmi(buf, (void __user *)src, nbytes);
+}
+
/*
* There are a couple of reasons for the 2/3rd prologue, courtesy of Linus:
*
@@ -97,17 +113,8 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
u8 opcodes[OPCODE_BUFSIZE];
unsigned long prologue = regs->ip - PROLOGUE_SIZE;
- bool bad_ip;
-
- /*
- * Make sure userspace isn't trying to trick us into dumping kernel
- * memory by pointing the userspace instruction pointer at it.
- */
- bad_ip = user_mode(regs) &&
- __chk_range_not_ok(prologue, OPCODE_BUFSIZE, TASK_SIZE_MAX);
- if (bad_ip || probe_kernel_read(opcodes, (u8 *)prologue,
- OPCODE_BUFSIZE)) {
+ if (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
printk("%sCode: Bad RIP value.\n", loglvl);
} else {
printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
@@ -126,15 +133,15 @@ void show_ip(struct pt_regs *regs, const char *loglvl)
show_opcodes(regs, loglvl);
}
-void show_iret_regs(struct pt_regs *regs)
+void show_iret_regs(struct pt_regs *regs, const char *log_lvl)
{
- show_ip(regs, KERN_DEFAULT);
- printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+ show_ip(regs, log_lvl);
+ printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss,
regs->sp, regs->flags);
}
static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
- bool partial)
+ bool partial, const char *log_lvl)
{
/*
* These on_stack() checks aren't strictly necessary: the unwind code
@@ -146,7 +153,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* they can be printed in the right context.
*/
if (!partial && on_stack(info, regs, sizeof(*regs))) {
- __show_regs(regs, SHOW_REGS_SHORT);
+ __show_regs(regs, SHOW_REGS_SHORT, log_lvl);
} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
@@ -155,7 +162,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
- show_iret_regs(regs);
+ show_iret_regs(regs, log_lvl);
}
}
@@ -210,7 +217,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%s <%s>\n", log_lvl, stack_name);
if (regs)
- show_regs_if_on_stack(&stack_info, regs, partial);
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -271,7 +278,7 @@ next:
/* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state, &partial);
if (regs)
- show_regs_if_on_stack(&stack_info, regs, partial);
+ show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
}
if (stack_name)
@@ -345,7 +352,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
oops_exit();
/* Executive summary in case the oops scrolled away */
- __show_regs(&exec_summary_regs, SHOW_REGS_ALL);
+ __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT);
if (!signr)
return;
@@ -437,9 +444,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
void show_regs(struct pt_regs *regs)
{
+ enum show_regs_mode print_kernel_regs;
+
show_regs_print_info(KERN_DEFAULT);
- __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL);
+ print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;
+ __show_regs(regs, print_kernel_regs, KERN_DEFAULT);
/*
* When in-kernel, we also print out the stack at the time of the fault..
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 06c818967bb6..eb86a2b831b1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -82,6 +82,45 @@ bool irq_fpu_usable(void)
}
EXPORT_SYMBOL(irq_fpu_usable);
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+ if (likely(use_xsave())) {
+ copy_xregs_to_kernel(&fpu->state.xsave);
+
+ /*
+ * AVX512 state is tracked here because its use is
+ * known to slow the max clock speed of the core.
+ */
+ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+ fpu->avx512_timestamp = jiffies;
+ return 1;
+ }
+
+ if (likely(use_fxsr())) {
+ copy_fxregs_to_kernel(fpu);
+ return 1;
+ }
+
+ /*
+ * Legacy FPU register saving, FNSAVE always clears FPU registers,
+ * so we have to mark them inactive:
+ */
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+ return 0;
+}
+EXPORT_SYMBOL(copy_fpregs_to_fpstate);
+
void kernel_fpu_begin(void)
{
preempt_disable();
@@ -101,6 +140,12 @@ void kernel_fpu_begin(void)
copy_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
+
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ ldmxcsr(MXCSR_DEFAULT);
+
+ if (boot_cpu_has(X86_FEATURE_FPU))
+ asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index bda2e5eaca0e..be2a68a09d19 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void)
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_dynamic());
+ }
}
static bool xfeature_enabled(enum xfeature xfeature)
@@ -486,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
return ebx;
}
-static int xfeature_size(int xfeature_nr)
+int xfeature_size(int xfeature_nr)
{
u32 eax, ebx, ecx, edx;
@@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr)
*/
if ((nr < XFEATURE_YMM) ||
(nr >= XFEATURE_MAX) ||
- (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
+ ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@@ -847,8 +850,10 @@ void fpu__resume_cpu(void)
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+ xfeatures_mask_dynamic());
+ }
}
/*
@@ -1074,7 +1079,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
copy_part(offsetof(struct fxregs_state, st_space), 128,
&xsave->i387.st_space, &kbuf, &offset_start, &count);
if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256,
+ copy_part(xstate_offsets[XFEATURE_SSE], 256,
&xsave->i387.xmm_space, &kbuf, &offset_start, &count);
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
@@ -1356,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate)
}
}
+/**
+ * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
+ * an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features saved into the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are saved into the xsave
+ * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
+ * supervisor feature). Besides the dynamic supervisor states, the legacy
+ * region and XSAVE header are also saved into the xsave area. The supervisor
+ * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+{
+ u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+ u32 lmask, hmask;
+ int err;
+
+ if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+ return;
+
+ if (WARN_ON_FPU(!dynamic_mask))
+ return;
+
+ lmask = dynamic_mask;
+ hmask = dynamic_mask >> 32;
+
+ XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+
+ /* Should never fault when copying to a kernel buffer */
+ WARN_ON_FPU(err);
+}
+
+/**
+ * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
+ * an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features restored from the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are restored from the
+ * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
+ * dynamic supervisor feature). Besides the dynamic supervisor states, the
+ * legacy region and XSAVE header are also restored from the xsave area. The
+ * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+{
+ u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+ u32 lmask, hmask;
+ int err;
+
+ if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+ return;
+
+ if (WARN_ON_FPU(!dynamic_mask))
+ return;
+
+ lmask = dynamic_mask;
+ hmask = dynamic_mask >> 32;
+
+ XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+
+ /* Should never fault when copying from a kernel buffer */
+ WARN_ON_FPU(err);
+}
+
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c84d28e90a58..51504566b3a6 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -86,7 +86,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code)
* sure what we read is what we expected it to be before modifying it.
*/
/* read the text we want to modify */
- if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ if (copy_from_kernel_nofault(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
WARN_ON(1);
return -EFAULT;
}
@@ -355,7 +355,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
- ret = probe_kernel_read(trampoline, (void *)start_offset, size);
+ ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0))
goto fail;
@@ -363,13 +363,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* The trampoline ends with ret(q) */
retq = (unsigned long)ftrace_stub;
- ret = probe_kernel_read(ip, (void *)retq, RET_SIZE);
+ ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
if (WARN_ON(ret < 0))
goto fail;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
- ret = probe_kernel_read(ip, (void *)retq, RET_SIZE);
+ ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
if (WARN_ON(ret < 0))
goto fail;
}
@@ -506,7 +506,7 @@ static void *addr_from_call(void *ptr)
union text_poke_insn call;
int ret;
- ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE);
+ ret = copy_from_kernel_nofault(&call, ptr, CALL_INSN_SIZE);
if (WARN_ON_ONCE(ret < 0))
return NULL;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index f3c76252247d..282b4ee1339f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -207,7 +207,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0db21206f2f3..7ecf9babf0cb 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = {
/* Must be page-aligned because the real IDT is used in the cpu entry area */
static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
-struct desc_ptr idt_descr __ro_after_init = {
+static struct desc_ptr idt_descr __ro_after_init = {
.size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index db6578d45157..57c2ecf43134 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -170,15 +170,6 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
if (!current_ei->efi_memmap_size)
return 0;
- /*
- * If 1:1 mapping is not enabled, second kernel can not setup EFI
- * and use EFI run time services. User space will have to pass
- * acpi_rsdp=<addr> on kernel command line to make second kernel boot
- * without efi.
- */
- if (efi_have_uv1_memmap())
- return 0;
-
params->secure_boot = boot_params.secure_boot;
ei->efi_loader_signature = current_ei->efi_loader_signature;
ei->efi_systab = current_ei->efi_systab;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index c44fe7d8d9a4..68acd30c6b87 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -732,11 +732,11 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
int err;
bpt->type = BP_BREAKPOINT;
- err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
BREAK_INSTR_SIZE);
if (err)
return err;
- err = probe_kernel_write((char *)bpt->bpt_addr,
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
if (!err)
return err;
@@ -768,7 +768,7 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
return 0;
knl_write:
- return probe_kernel_write((char *)bpt->bpt_addr,
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3bafe1bd4dc7..fdadc37d72af 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -243,7 +244,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
* Fortunately, we know that the original code is the ideal 5-byte
* long NOP.
*/
- if (probe_kernel_read(buf, (void *)addr,
+ if (copy_from_kernel_nofault(buf, (void *)addr,
MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
return 0UL;
@@ -346,7 +347,8 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
return 0;
/* This can access kernel text if given address is not recovered */
- if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
+ if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
+ MAX_INSN_SIZE))
return 0;
kernel_insn_init(insn, dest, MAX_INSN_SIZE);
@@ -471,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p)
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
/* OK, write back the instruction(s) into ROX insn buffer */
text_poke(p->ainsn.insn, buf, len);
@@ -502,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
text_poke_sync();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
void arch_disarm_kprobe(struct kprobe *p)
{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
}
@@ -515,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}
@@ -753,16 +767,11 @@ asm(
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
-static struct kprobe kretprobe_kprobe = {
- .addr = (void *)kretprobe_trampoline,
-};
-
/*
* Called from kretprobe_trampoline
*/
__used __visible void *trampoline_handler(struct pt_regs *regs)
{
- struct kprobe_ctlblk *kcb;
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
@@ -772,16 +781,12 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
void *frame_pointer;
bool skipped = false;
- preempt_disable();
-
/*
* Set a dummy kprobe for avoiding kretprobe recursion.
* Since kretprobe never run in kprobe handler, kprobe must not
* be running at this point.
*/
- kcb = get_kprobe_ctlblk();
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ kprobe_busy_begin();
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -857,7 +862,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
__this_cpu_write(current_kprobe, &ri->rp->kp);
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
+ __this_cpu_write(current_kprobe, &kprobe_busy);
}
recycle_rp_inst(ri, &empty_rp);
@@ -873,8 +878,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
kretprobe_hash_unlock(current, &flags);
- __this_cpu_write(current_kprobe, NULL);
- preempt_enable();
+ kprobe_busy_end();
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 321c19950285..40f380461e6d 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -6,6 +6,7 @@
* Copyright (C) Hitachi Ltd., 2012
*/
#include <linux/kprobes.h>
+#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -56,7 +57,7 @@ found:
* overwritten by jump destination address. In this case, original
* bytes must be recovered from op->optinsn.copied_insn buffer.
*/
- if (probe_kernel_read(buf, (void *)addr,
+ if (copy_from_kernel_nofault(buf, (void *)addr,
MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
return 0UL;
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
op->optinsn.insn = NULL;
op->optinsn.size = 0;
}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
(u8 *)op->kp.addr + op->optinsn.size);
len += JMP32_INSN_SIZE;
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
/* We have to use text_poke() for instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
text_poke(slot, buf, len);
+
ret = 0;
out:
kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
*/
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- arch_arm_kprobe(&op->kp);
- text_poke(op->kp.addr + INT3_INSN_SIZE,
- op->optinsn.copied_insn, DISP32_SIZE);
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+ text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ text_poke_sync();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df63786e7bfa..233c77d056c9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
u32 reason = kvm_read_and_reset_apf_flags();
- bool rcu_exit;
+ irqentry_state_t state;
switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
@@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
return false;
}
- rcu_exit = idtentry_enter_cond_rcu(regs);
+ state = irqentry_enter(regs);
instrumentation_begin();
/*
@@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
}
instrumentation_end();
- idtentry_exit_cond_rcu(regs, rcu_exit);
+ irqentry_exit(regs, state);
return true;
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 8748321c4486..b8aee71840ae 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -29,6 +29,8 @@
#include <asm/mmu_context.h>
#include <asm/pgtable_areas.h>
+#include <xen/xen.h>
+
/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
@@ -543,6 +545,28 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
return bytecount;
}
+static bool allow_16bit_segments(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_16BIT))
+ return false;
+
+#ifdef CONFIG_XEN_PV
+ /*
+ * Xen PV does not implement ESPFIX64, which means that 16-bit
+ * segments will not work correctly. Until either Xen PV implements
+ * ESPFIX64 and can signal this fact to the guest or unless someone
+ * provides compelling evidence that allowing broken 16-bit segments
+ * is worthwhile, disallow 16-bit segments under Xen PV.
+ */
+ if (xen_pv_domain()) {
+ pr_info_once("Warning: 16-bit segments do not work correctly in a Xen PV guest\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
@@ -574,7 +598,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
/* The user wants to clear the entry. */
memset(&ldt, 0, sizeof(ldt));
} else {
- if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ if (!ldt_info.seg_32bit && !allow_16bit_segments()) {
error = -EINVAL;
goto out;
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1547be359d7f..49dcfb85e773 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -42,6 +42,14 @@
static struct class *msr_class;
static enum cpuhp_state cpuhp_msr_state;
+enum allow_write_msrs {
+ MSR_WRITES_ON,
+ MSR_WRITES_OFF,
+ MSR_WRITES_DEFAULT,
+};
+
+static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
+
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf,
return bytes ? bytes : err;
}
+static int filter_write(u32 reg)
+{
+ switch (allow_writes) {
+ case MSR_WRITES_ON: return 0;
+ case MSR_WRITES_OFF: return -EPERM;
+ default: break;
+ }
+
+ if (reg == MSR_IA32_ENERGY_PERF_BIAS)
+ return 0;
+
+ pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n"
+ "Please report to x86@kernel.org\n",
+ reg, current->comm);
+
+ return 0;
+}
+
static ssize_t msr_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
if (err)
return err;
+ err = filter_write(reg);
+ if (err)
+ return err;
+
if (count % 8)
return -EINVAL; /* Invalid chunk size */
@@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
err = -EFAULT;
break;
}
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
if (err)
break;
+
tmp += 2;
bytes += 8;
}
@@ -242,6 +276,41 @@ static void __exit msr_exit(void)
}
module_exit(msr_exit)
+static int set_allow_writes(const char *val, const struct kernel_param *cp)
+{
+ /* val is NUL-terminated, see kernfs_fop_write() */
+ char *s = strstrip((char *)val);
+
+ if (!strcmp(s, "on"))
+ allow_writes = MSR_WRITES_ON;
+ else if (!strcmp(s, "off"))
+ allow_writes = MSR_WRITES_OFF;
+ else
+ allow_writes = MSR_WRITES_DEFAULT;
+
+ return 0;
+}
+
+static int get_allow_writes(char *buf, const struct kernel_param *kp)
+{
+ const char *res;
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: res = "on"; break;
+ case MSR_WRITES_OFF: res = "off"; break;
+ default: res = "default"; break;
+ }
+
+ return sprintf(buf, "%s\n", res);
+}
+
+static const struct kernel_param_ops allow_writes_ops = {
+ .set = set_allow_writes,
+ .get = get_allow_writes
+};
+
+module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600);
+
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic MSR driver");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2de365f15684..4fc9954a9560 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
instrumentation_begin();
- trace_hardirqs_off_finish();
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
@@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
unknown_nmi_error(reason, regs);
out:
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
}
@@ -478,7 +475,9 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
- if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
+ bool irq_state;
+
+ if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
@@ -491,14 +490,14 @@ nmi_restart:
this_cpu_write(nmi_dr7, local_db_save());
- nmi_enter();
+ irq_state = idtentry_enter_nmi(regs);
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
local_db_restore(this_cpu_read(nmi_dr7));
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 674a7d66d960..de2138ba38e5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -324,7 +324,8 @@ struct paravirt_patch_template pv_ops = {
.cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
- .cpu.update_io_bitmap = native_tss_update_io_bitmap,
+ .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
+ .cpu.update_io_bitmap = native_tss_update_io_bitmap,
#endif
.cpu.start_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index ee0286390a4c..9e1def3744f2 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -94,12 +94,12 @@ static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short
}
static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
- const unsigned char *rom_list)
+ const void *rom_list)
{
unsigned short device;
do {
- if (probe_kernel_address(rom_list, device) != 0)
+ if (get_kernel_nofault(device, rom_list) != 0)
device = 0;
if (device && match_id(pdev, vendor, device))
@@ -119,19 +119,19 @@ static struct resource *find_oprom(struct pci_dev *pdev)
for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
struct resource *res = &adapter_rom_resources[i];
unsigned short offset, vendor, device, list, rev;
- const unsigned char *rom;
+ const void *rom;
if (res->end == 0)
break;
rom = isa_bus_to_virt(res->start);
- if (probe_kernel_address(rom + 0x18, offset) != 0)
+ if (get_kernel_nofault(offset, rom + 0x18) != 0)
continue;
- if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+ if (get_kernel_nofault(vendor, rom + offset + 0x4) != 0)
continue;
- if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+ if (get_kernel_nofault(device, rom + offset + 0x6) != 0)
continue;
if (match_id(pdev, vendor, device)) {
@@ -139,8 +139,8 @@ static struct resource *find_oprom(struct pci_dev *pdev)
break;
}
- if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
- probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+ if (get_kernel_nofault(list, rom + offset + 0x8) == 0 &&
+ get_kernel_nofault(rev, rom + offset + 0xc) == 0 &&
rev >= 3 && list &&
probe_list(pdev, vendor, rom + offset + list)) {
oprom = res;
@@ -183,14 +183,14 @@ static int __init romsignature(const unsigned char *rom)
const unsigned short * const ptr = (const unsigned short *)rom;
unsigned short sig;
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+ return get_kernel_nofault(sig, ptr) == 0 && sig == ROMSIGNATURE;
}
static int __init romchecksum(const unsigned char *rom, unsigned long length)
{
unsigned char sum, c;
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ for (sum = 0; length && get_kernel_nofault(c, rom++) == 0; length--)
sum += c;
return !length && !sum;
}
@@ -211,7 +211,7 @@ void __init probe_roms(void)
video_rom_resource.start = start;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
@@ -249,7 +249,7 @@ void __init probe_roms(void)
if (!romsignature(rom))
continue;
- if (probe_kernel_address(rom + 2, c) != 0)
+ if (get_kernel_nofault(c, rom + 2) != 0)
continue;
/* 0 < length <= 0x7f * 512, historically */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f362ce0d5ac0..994d8393f2f7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -121,8 +121,8 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
-int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
- unsigned long arg, struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
+ struct task_struct *p, unsigned long tls)
{
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
@@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
- savesegment(gs, p->thread.gsindex);
- p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
- savesegment(fs, p->thread.fsindex);
- p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
+ current_save_fsgs();
+ p->thread.fsindex = current->thread.fsindex;
+ p->thread.fsbase = current->thread.fsbase;
+ p->thread.gsindex = current->thread.gsindex;
+ p->thread.gsbase = current->thread.gsbase;
+
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
#else
@@ -322,20 +324,6 @@ void arch_setup_new_exec(void)
}
#ifdef CONFIG_X86_IOPL_IOPERM
-static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
-{
- /*
- * Invalidate the I/O bitmap by moving io_bitmap_base outside the
- * TSS limit so any subsequent I/O access from user space will
- * trigger a #GP.
- *
- * This is correct even when VMEXIT rewrites the TSS limit
- * to 0x67 as the only requirement is that the base points
- * outside the limit.
- */
- tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
-}
-
static inline void switch_to_bitmap(unsigned long tifp)
{
/*
@@ -346,7 +334,7 @@ static inline void switch_to_bitmap(unsigned long tifp)
* user mode.
*/
if (tifp & _TIF_IO_BITMAP)
- tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw));
+ tss_invalidate_io_bitmap();
}
static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
@@ -380,7 +368,7 @@ void native_tss_update_io_bitmap(void)
u16 *base = &tss->x86_tss.io_bitmap_base;
if (!test_thread_flag(TIF_IO_BITMAP)) {
- tss_invalidate_io_bitmap(tss);
+ native_tss_invalidate_io_bitmap();
return;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index acfd6d2a0cbf..4f2f54e1281c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,8 @@
#include "process.h"
-void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
else
savesegment(gs, gs);
- show_ip(regs, KERN_DEFAULT);
+ show_ip(regs, log_lvl);
- printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
- regs->ax, regs->bx, regs->cx, regs->dx);
- printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
- regs->si, regs->di, regs->bp, regs->sp);
- printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
- (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
+ printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx, regs->dx);
+ printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
+ log_lvl, regs->si, regs->di, regs->bp, regs->sp);
+ printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n",
+ log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags);
if (mode != SHOW_REGS_ALL)
return;
@@ -83,8 +84,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
cr2 = read_cr2();
cr3 = __read_cr3();
cr4 = __read_cr4();
- printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
- cr0, cr2, cr3, cr4);
+ printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
+ log_lvl, cr0, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
@@ -98,10 +99,10 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
(d6 == DR6_RESERVED) && (d7 == 0x400))
return;
- printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
- d0, d1, d2, d3);
- printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
- d6, d7);
+ printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
+ log_lvl, d0, d1, d2, d3);
+ printk("%sDR6: %08lx DR7: %08lx\n",
+ log_lvl, d6, d7);
}
void release_thread(struct task_struct *dead_task)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9a97415b2139..d6f946707270 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -62,30 +62,31 @@
#include "process.h"
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
+void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
+ const char *log_lvl)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
unsigned int fsindex, gsindex;
unsigned int ds, es;
- show_iret_regs(regs);
+ show_iret_regs(regs, log_lvl);
if (regs->orig_ax != -1)
pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
else
pr_cont("\n");
- printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
- regs->ax, regs->bx, regs->cx);
- printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
- regs->dx, regs->si, regs->di);
- printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
- regs->bp, regs->r8, regs->r9);
- printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
+ printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
+ log_lvl, regs->ax, regs->bx, regs->cx);
+ printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
+ log_lvl, regs->dx, regs->si, regs->di);
+ printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
+ log_lvl, regs->bp, regs->r8, regs->r9);
+ printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
+ log_lvl, regs->r10, regs->r11, regs->r12);
+ printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
+ log_lvl, regs->r13, regs->r14, regs->r15);
if (mode == SHOW_REGS_SHORT)
return;
@@ -93,8 +94,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
if (mode == SHOW_REGS_USER) {
rdmsrl(MSR_FS_BASE, fs);
rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
- printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n",
- fs, shadowgs);
+ printk("%sFS: %016lx GS: %016lx\n",
+ log_lvl, fs, shadowgs);
return;
}
@@ -112,12 +113,12 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
cr3 = __read_cr3();
cr4 = __read_cr4();
- printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs, fsindex, gs, gsindex, shadowgs);
- printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
- es, cr0);
- printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
- cr4);
+ printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ log_lvl, fs, fsindex, gs, gsindex, shadowgs);
+ printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n",
+ log_lvl, regs->cs, ds, es, cr0);
+ printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
+ log_lvl, cr2, cr3, cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
@@ -129,14 +130,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
/* Only print out debug registers if they are in their non-default state. */
if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
(d6 == DR6_RESERVED) && (d7 == 0x400))) {
- printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
- d0, d1, d2);
- printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
- d3, d6, d7);
+ printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
+ log_lvl, d0, d1, d2);
+ printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
+ log_lvl, d3, d6, d7);
}
if (boot_cpu_has(X86_FEATURE_OSPKE))
- printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
+ printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
void release_thread(struct task_struct *dead_task)
@@ -150,6 +151,56 @@ enum which_selector {
};
/*
+ * Out of line to be protected from kprobes and tracing. If this would be
+ * traced or probed than any access to a per CPU variable happens with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr unsigned long __rdgsbase_inactive(void)
+{
+ unsigned long gsbase;
+
+ lockdep_assert_irqs_disabled();
+
+ if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ gsbase = rdgsbase();
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
+
+ return gsbase;
+}
+
+/*
+ * Out of line to be protected from kprobes and tracing. If this would be
+ * traced or probed than any access to a per CPU variable happens with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with native_ prefix.
+ */
+static noinstr void __wrgsbase_inactive(unsigned long gsbase)
+{
+ lockdep_assert_irqs_disabled();
+
+ if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ native_swapgs();
+ wrgsbase(gsbase);
+ native_swapgs();
+ } else {
+ instrumentation_begin();
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ instrumentation_end();
+ }
+}
+
+/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
@@ -198,22 +249,35 @@ static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
- save_base_legacy(task, task->thread.fsindex, FS);
- save_base_legacy(task, task->thread.gsindex, GS);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /*
+ * If FSGSBASE is enabled, we can't make any useful guesses
+ * about the base, and user code expects us to save the current
+ * value. Fortunately, reading the base directly is efficient.
+ */
+ task->thread.fsbase = rdfsbase();
+ task->thread.gsbase = __rdgsbase_inactive();
+ } else {
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+ }
}
-#if IS_ENABLED(CONFIG_KVM)
/*
* While a process is running,current->thread.fsbase and current->thread.gsbase
- * may not match the corresponding CPU registers (see save_base_legacy()). KVM
- * wants an efficient way to save and restore FSBASE and GSBASE.
- * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ * may not match the corresponding CPU registers (see save_base_legacy()).
*/
-void save_fsgs_for_kvm(void)
+void current_save_fsgs(void)
{
+ unsigned long flags;
+
+ /* Interrupts need to be off for FSGSBASE */
+ local_irq_save(flags);
save_fsgs(current);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#if IS_ENABLED(CONFIG_KVM)
+EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif
static __always_inline void loadseg(enum which_selector which,
@@ -278,14 +342,26 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
- load_seg_legacy(prev->fsindex, prev->fsbase,
- next->fsindex, next->fsbase, FS);
- load_seg_legacy(prev->gsindex, prev->gsbase,
- next->gsindex, next->gsbase, GS);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /* Update the FS and GS selectors if they could have changed. */
+ if (unlikely(prev->fsindex || next->fsindex))
+ loadseg(FS, next->fsindex);
+ if (unlikely(prev->gsindex || next->gsindex))
+ loadseg(GS, next->gsindex);
+
+ /* Update the bases. */
+ wrfsbase(next->fsbase);
+ __wrgsbase_inactive(next->gsbase);
+ } else {
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+ }
}
-static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
- unsigned short selector)
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+ unsigned short selector)
{
unsigned short idx = selector >> 3;
unsigned long base;
@@ -327,13 +403,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
}
+unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+ unsigned long gsbase;
+
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ gsbase = __rdgsbase_inactive();
+ local_irq_restore(flags);
+ } else {
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ }
+
+ return gsbase;
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wrgsbase_inactive(gsbase);
+ local_irq_restore(flags);
+ } else {
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ }
+}
+
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
unsigned long fsbase;
if (task == current)
fsbase = x86_fsbase_read_cpu();
- else if (task->thread.fsindex == 0)
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.fsindex == 0))
fsbase = task->thread.fsbase;
else
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -347,7 +454,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
if (task == current)
gsbase = x86_gsbase_read_cpu_inactive();
- else if (task->thread.gsindex == 0)
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.gsindex == 0))
gsbase = task->thread.gsbase;
else
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 44130588987f..3f006489087f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -281,17 +281,9 @@ static int set_segment_reg(struct task_struct *task,
return -EIO;
/*
- * This function has some ABI oddities.
- *
- * A 32-bit ptracer probably expects that writing FS or GS will change
- * FSBASE or GSBASE respectively. In the absence of FSGSBASE support,
- * this code indeed has that effect. When FSGSBASE is added, this
- * will require a special case.
- *
- * For existing 64-bit ptracers, writing FS or GS *also* currently
- * changes the base if the selector is nonzero the next time the task
- * is run. This behavior may not be needed, and trying to preserve it
- * when FSGSBASE is added would be complicated at best.
+ * Writes to FS and GS will change the stored selector. Whether
+ * this changes the segment base as well depends on whether
+ * FSGSBASE is enabled.
*/
switch (offset) {
@@ -379,25 +371,12 @@ static int putreg(struct task_struct *child,
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_MAX)
return -EIO;
- /*
- * When changing the FS base, use do_arch_prctl_64()
- * to set the index to zero and to set the base
- * as requested.
- *
- * NB: This behavior is nonsensical and likely needs to
- * change when FSGSBASE support is added.
- */
- if (child->thread.fsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
if (value >= TASK_SIZE_MAX)
return -EIO;
- if (child->thread.gsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ x86_gsbase_write_task(child, value);
return 0;
#endif
}
@@ -880,14 +859,39 @@ long arch_ptrace(struct task_struct *child, long request,
static int putreg32(struct task_struct *child, unsigned regno, u32 value)
{
struct pt_regs *regs = task_pt_regs(child);
+ int ret;
switch (regno) {
SEG32(cs);
SEG32(ds);
SEG32(es);
- SEG32(fs);
- SEG32(gs);
+
+ /*
+ * A 32-bit ptracer on a 64-bit kernel expects that writing
+ * FS or GS will also update the base. This is needed for
+ * operations like PTRACE_SETREGS to fully restore a saved
+ * CPU state.
+ */
+
+ case offsetof(struct user32, regs.fs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, fs),
+ value);
+ if (ret == 0)
+ child->thread.fsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
+ case offsetof(struct user32, regs.gs):
+ ret = set_segment_reg(child,
+ offsetof(struct user_regs_struct, gs),
+ value);
+ if (ret == 0)
+ child->thread.gsbase =
+ x86_fsgsbase_read_task(child, value);
+ return ret;
+
SEG32(ss);
R32(ebx, bx);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 896d74cb5081..1b10717c9321 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void)
static void ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(rcba);
+ u32 rcba;
int err = 0;
if (hpet_address || force_hpet_address)
@@ -185,7 +185,7 @@ static void hpet_print_force_info(void)
static void old_ich_force_hpet_resume(void)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (!force_hpet_address || !cached_dev)
return;
@@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void)
static void old_ich_force_enable_hpet(struct pci_dev *dev)
{
u32 val;
- u32 uninitialized_var(gen_cntl);
+ u32 gen_cntl;
if (hpet_address || force_hpet_address)
return;
@@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void)
static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
@@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void)
static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
- u32 uninitialized_var(val);
+ u32 val;
if (hpet_address || force_hpet_address)
return;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 399f97abee02..d5fa494c2304 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -25,6 +25,7 @@
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>
#include <linux/context_tracking.h>
+#include <linux/entry-common.h>
#include <linux/syscalls.h>
#include <asm/processor.h>
@@ -803,7 +804,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-void do_signal(struct pt_regs *regs)
+void arch_do_signal(struct pt_regs *regs)
{
struct ksignal ksig;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..27aa04a95702 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -51,11 +51,11 @@
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
-#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
+#include <linux/overflow.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -80,6 +80,7 @@
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -259,21 +260,10 @@ static void notrace start_secondary(void *unused)
/* enable local interrupts */
local_irq_enable();
- /* to prevent fake stack check failure in clock setup */
- boot_init_stack_canary();
-
x86_cpuinit.setup_percpu_clockev();
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
- /*
- * Prevent tail call to cpu_startup_entry() because the stack protector
- * guard has been changed a couple of function calls up, in
- * boot_init_stack_canary() and must not be checked before tail calling
- * another function.
- */
- prevent_tail_call_optimization();
}
/**
@@ -1011,6 +1001,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
alternatives_enable_smp();
per_cpu(current_task, cpu) = idle;
+ cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
ret = irq_init_percpu_irqstack(cpu);
@@ -1777,6 +1768,7 @@ void native_play_dead(void)
#endif
+#ifdef CONFIG_X86_64
/*
* APERF/MPERF frequency ratio computation.
*
@@ -1975,6 +1967,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
static bool intel_set_max_freq_ratio(void)
{
u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
@@ -2000,15 +1993,23 @@ out:
/*
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (!base_freq) {
- pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
return false;
}
- arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
- base_freq);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
arch_set_max_freq_ratio(turbo_disabled());
+
return true;
}
@@ -2048,11 +2049,19 @@ static void init_freq_invariance(bool secondary)
}
}
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
- u64 freq_scale;
+ u64 freq_scale = SCHED_CAPACITY_SCALE;
u64 aperf, mperf;
u64 acnt, mcnt;
@@ -2064,19 +2073,32 @@ void arch_scale_freq_tick(void)
acnt = aperf - this_cpu_read(arch_prev_aperf);
mcnt = mperf - this_cpu_read(arch_prev_mperf);
- if (!mcnt)
- return;
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
- acnt <<= 2*SCHED_CAPACITY_SHIFT;
- mcnt *= arch_max_freq_ratio;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
}
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6ad43fc44556..2fd698e28e4d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
* or a page fault), which can make frame pointers
* unreliable.
*/
-
if (IS_ENABLED(CONFIG_FRAME_POINTER))
return -EINVAL;
}
@@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (unwind_error(&state))
return -EINVAL;
- /* Success path for non-user tasks, i.e. kthreads and idle tasks */
- if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
- return -EINVAL;
-
return 0;
}
diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c
index f8d65c99feb8..720cde885042 100644
--- a/arch/x86/kernel/sys_ia32.c
+++ b/arch/x86/kernel/sys_ia32.c
@@ -251,9 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags,
.tls = tls_val,
};
- if (!legacy_clone_args_valid(&args))
- return -EINVAL;
-
return _do_fork(&args);
}
#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index af75109485c2..438fc554d48d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -84,17 +84,16 @@ static inline void cond_local_irq_disable(struct pt_regs *regs)
local_irq_disable();
}
-int is_valid_bugaddr(unsigned long addr)
+__always_inline int is_valid_bugaddr(unsigned long addr)
{
- unsigned short ud;
-
if (addr < TASK_SIZE_MAX)
return 0;
- if (probe_kernel_address((unsigned short *)addr, ud))
- return 0;
-
- return ud == INSN_UD0 || ud == INSN_UD2;
+ /*
+ * We got #UD, if the text isn't readable we'd have gotten
+ * a different exception.
+ */
+ return *(unsigned short *)addr == INSN_UD2;
}
static nokprobe_inline int
@@ -216,46 +215,51 @@ static inline void handle_invalid_op(struct pt_regs *regs)
ILL_ILLOPN, error_get_trap_addr(regs));
}
-DEFINE_IDTENTRY_RAW(exc_invalid_op)
+static noinstr bool handle_bug(struct pt_regs *regs)
{
- bool rcu_exit;
+ bool handled = false;
+
+ if (!is_valid_bugaddr(regs->ip))
+ return handled;
/*
- * Handle BUG/WARN like NMIs instead of like normal idtentries:
- * if we bugged/warned in a bad RCU context, for example, the last
- * thing we want is to BUG/WARN again in the idtentry code, ad
- * infinitum.
+ * All lies, just get the WARN/BUG out.
+ */
+ instrumentation_begin();
+ /*
+ * Since we're emulating a CALL with exceptions, restore the interrupt
+ * state to what it was at the exception site.
*/
- if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) {
- enum bug_trap_type type;
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_enable();
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ regs->ip += LEN_UD2;
+ handled = true;
+ }
+ if (regs->flags & X86_EFLAGS_IF)
+ raw_local_irq_disable();
+ instrumentation_end();
- nmi_enter();
- instrumentation_begin();
- trace_hardirqs_off_finish();
- type = report_bug(regs->ip, regs);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
- instrumentation_end();
- nmi_exit();
+ return handled;
+}
- if (type == BUG_TRAP_TYPE_WARN) {
- /* Skip the ud2. */
- regs->ip += LEN_UD2;
- return;
- }
+DEFINE_IDTENTRY_RAW(exc_invalid_op)
+{
+ irqentry_state_t state;
- /*
- * Else, if this was a BUG and report_bug returns or if this
- * was just a normal #UD, we want to continue onward and
- * crash.
- */
- }
+ /*
+ * We use UD2 as a short encoding for 'CALL __WARN', as such
+ * handle it before exception entry to avoid recursive WARN
+ * in case exception entry is the one triggering WARNs.
+ */
+ if (!user_mode(regs) && handle_bug(regs))
+ return;
- rcu_exit = idtentry_enter_cond_rcu(regs);
+ state = irqentry_enter(regs);
instrumentation_begin();
handle_invalid_op(regs);
instrumentation_end();
- idtentry_exit_cond_rcu(regs, rcu_exit);
+ irqentry_exit(regs, state);
}
DEFINE_IDTENTRY(exc_coproc_segment_overrun)
@@ -299,6 +303,8 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
error_code, BUS_ADRALN, NULL);
+
+ local_irq_disable();
}
#ifdef CONFIG_VMAP_STACK
@@ -399,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif
- nmi_enter();
+ idtentry_enter_nmi(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -488,7 +494,8 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
u8 insn_buf[MAX_INSN_SIZE];
struct insn insn;
- if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
+ MAX_INSN_SIZE))
return GP_NO_HINT;
kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
@@ -631,28 +638,25 @@ DEFINE_IDTENTRY_RAW(exc_int3)
return;
/*
- * idtentry_enter_user() uses static_branch_{,un}likely() and therefore
- * can trigger INT3, hence poke_int3_handler() must be done
- * before. If the entry came from kernel mode, then use nmi_enter()
- * because the INT3 could have been hit in any context including
- * NMI.
+ * irqentry_enter_from_user_mode() uses static_branch_{,un}likely()
+ * and therefore can trigger INT3, hence poke_int3_handler() must
+ * be done before. If the entry came from kernel mode, then use
+ * nmi_enter() because the INT3 could have been hit in any context
+ * including NMI.
*/
if (user_mode(regs)) {
- idtentry_enter_user(regs);
+ irqentry_enter_from_user_mode(regs);
instrumentation_begin();
do_int3_user(regs);
instrumentation_end();
- idtentry_exit_user(regs);
+ irqentry_exit_to_user_mode(regs);
} else {
- nmi_enter();
+ bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
- trace_hardirqs_off_finish();
if (!do_int3(regs))
die("int3", regs, 0);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
}
}
@@ -690,13 +694,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
(struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the temporary storage. */
- memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
+ __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
- memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
+ __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
/* Update the entry stack */
- memcpy(new_stack, &tmp, sizeof(tmp));
+ __memcpy(new_stack, &tmp, sizeof(tmp));
BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
@@ -860,9 +864,14 @@ out:
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
- nmi_enter();
+ bool irq_state = idtentry_enter_nmi(regs);
instrumentation_begin();
- trace_hardirqs_off_finish();
+
+ /*
+ * If something gets miswired and we end up here for a user mode
+ * #DB, we will malfunction.
+ */
+ WARN_ON_ONCE(user_mode(regs));
/*
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
@@ -873,21 +882,26 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
handle_debug(regs, dr6, false);
- if (regs->flags & X86_EFLAGS_IF)
- trace_hardirqs_on_prepare();
instrumentation_end();
- nmi_exit();
+ idtentry_exit_nmi(regs, irq_state);
}
static __always_inline void exc_debug_user(struct pt_regs *regs,
unsigned long dr6)
{
- idtentry_enter_user(regs);
+ /*
+ * If something gets miswired and we end up here for a kernel mode
+ * #DB, we will malfunction.
+ */
+ WARN_ON_ONCE(!user_mode(regs));
+
+ irqentry_enter_from_user_mode(regs);
instrumentation_begin();
handle_debug(regs, dr6, true);
+
instrumentation_end();
- idtentry_exit_user(regs);
+ irqentry_exit_to_user_mode(regs);
}
#ifdef CONFIG_X86_64
@@ -912,7 +926,7 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
}
#else
/* 32 bit does not have separate entry points. */
-DEFINE_IDTENTRY_DEBUG(exc_debug)
+DEFINE_IDTENTRY_RAW(exc_debug)
{
unsigned long dr6, dr7;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 722a85f3b2dd..d7c44b257f7f 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -269,13 +269,13 @@ bool unwind_next_frame(struct unwind_state *state)
/*
* kthreads (other than the boot CPU's idle thread) have some
* partial regs at the end of their stack which were placed
- * there by copy_thread_tls(). But the regs don't have any
+ * there by copy_thread(). But the regs don't have any
* useful information, so we can skip them.
*
* This user_mode() check is slightly broader than a PF_KTHREAD
* check because it also catches the awkward situation where a
* newly forked kthread transitions into a user task by calling
- * do_execve(), which eventually clears PF_KTHREAD.
+ * kernel_execve(), which eventually clears PF_KTHREAD.
*/
if (!user_mode(regs))
goto the_end;
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 7f969b2d240f..ec88bbe08a32 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -440,8 +440,11 @@ bool unwind_next_frame(struct unwind_state *state)
/*
* Find the orc_entry associated with the text address.
*
- * Decrement call return addresses by one so they work for sibling
- * calls and calls to noreturn functions.
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
if (!orc) {
@@ -662,6 +665,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
state->sp = task->thread.sp;
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork;
}
if (get_stack_info((unsigned long *)state->sp, state->task,
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3bfc8dd8a43d..9a03e5b23135 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -358,6 +358,7 @@ SECTIONS
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
*(.bss..page_aligned)
+ . = ALIGN(PAGE_SIZE);
*(BSS_MAIN)
BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);