aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig167
-rw-r--r--arch/x86/Kconfig.assembler5
-rw-r--r--arch/x86/Kconfig.debug2
-rw-r--r--arch/x86/Makefile28
-rw-r--r--arch/x86/Makefile.um13
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/bioscall.S4
-rw-r--r--arch/x86/boot/compressed/Makefile12
-rw-r--r--arch/x86/boot/compressed/efi_mixed.S345
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S195
-rw-r--r--arch/x86/boot/compressed/head_32.S6
-rw-r--r--arch/x86/boot/compressed/head_64.S313
-rw-r--r--arch/x86/boot/compressed/ident_map_64.c14
-rw-r--r--arch/x86/boot/compressed/kaslr.c2
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S152
-rw-r--r--arch/x86/boot/compressed/misc.c18
-rw-r--r--arch/x86/boot/compressed/misc.h11
-rw-r--r--arch/x86/boot/compressed/sev.c72
-rw-r--r--arch/x86/boot/compressed/tdx.c4
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S1
-rw-r--r--arch/x86/boot/cpuflags.c15
-rw-r--r--arch/x86/boot/header.S7
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/tools/build.c2
-rw-r--r--arch/x86/coco/core.c53
-rw-r--r--arch/x86/coco/tdx/tdcall.S146
-rw-r--r--arch/x86/coco/tdx/tdx.c167
-rw-r--r--arch/x86/crypto/Kconfig38
-rw-r--r--arch/x86/crypto/Makefile9
-rw-r--r--arch/x86/crypto/aegis128-aesni-asm.S15
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S198
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S254
-rw-r--r--arch/x86/crypto/aria-aesni-avx-asm_64.S203
-rw-r--r--arch/x86/crypto/aria-aesni-avx2-asm_64.S1441
-rw-r--r--arch/x86/crypto/aria-avx.h48
-rw-r--r--arch/x86/crypto/aria-gfni-avx512-asm_64.S971
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c254
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c49
-rw-r--r--arch/x86/crypto/aria_gfni_avx512_glue.c250
-rw-r--r--arch/x86/crypto/blake2s-glue.c5
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S71
-rw-r--r--arch/x86/crypto/blowfish_glue.c200
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S32
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S34
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S6
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S40
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S32
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S16
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S70
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S1
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S96
-rw-r--r--arch/x86/crypto/ecb_cbc_helpers.h19
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S10
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c45
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S5
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S5
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c11
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c11
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl1
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S2
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S2
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S25
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S4
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S3
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S20
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S58
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S20
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S4
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S11
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S19
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S11
-rw-r--r--arch/x86/crypto/sm3-avx-asm_64.S4
-rw-r--r--arch/x86/crypto/sm4-aesni-avx-asm_64.S14
-rw-r--r--arch/x86/crypto/sm4-aesni-avx2-asm_64.S13
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S2
-rw-r--r--arch/x86/crypto/twofish_glue.c2
-rw-r--r--arch/x86/entry/entry_32.S4
-rw-r--r--arch/x86/entry/entry_64.S87
-rw-r--r--arch/x86/entry/entry_64_compat.S11
-rw-r--r--arch/x86/entry/thunk_64.S4
-rw-r--r--arch/x86/entry/vdso/Makefile22
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S2
-rw-r--r--arch/x86/entry/vdso/vdso2c.h6
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c20
-rw-r--r--arch/x86/entry/vdso/vdso32/fake_32bit_build.h25
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c27
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S1
-rw-r--r--arch/x86/entry/vdso/vdso32/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c3
-rw-r--r--arch/x86/entry/vdso/vma.c55
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c4
-rw-r--r--arch/x86/events/amd/brs.c15
-rw-r--r--arch/x86/events/amd/core.c16
-rw-r--r--arch/x86/events/amd/ibs.c13
-rw-r--r--arch/x86/events/amd/lbr.c6
-rw-r--r--arch/x86/events/amd/uncore.c1
-rw-r--r--arch/x86/events/core.c66
-rw-r--r--arch/x86/events/intel/core.c257
-rw-r--r--arch/x86/events/intel/cstate.c24
-rw-r--r--arch/x86/events/intel/ds.c197
-rw-r--r--arch/x86/events/intel/lbr.c40
-rw-r--r--arch/x86/events/intel/p4.c2
-rw-r--r--arch/x86/events/intel/pt.c9
-rw-r--r--arch/x86/events/intel/uncore.c42
-rw-r--r--arch/x86/events/intel/uncore.h29
-rw-r--r--arch/x86/events/intel/uncore_discovery.c60
-rw-r--r--arch/x86/events/intel/uncore_discovery.h14
-rw-r--r--arch/x86/events/intel/uncore_snb.c164
-rw-r--r--arch/x86/events/intel/uncore_snbep.c652
-rw-r--r--arch/x86/events/msr.c5
-rw-r--r--arch/x86/events/perf_event.h54
-rw-r--r--arch/x86/events/rapl.c5
-rw-r--r--arch/x86/events/zhaoxin/core.c8
-rw-r--r--arch/x86/hyperv/Makefile1
-rw-r--r--arch/x86/hyperv/hv_apic.c12
-rw-r--r--arch/x86/hyperv/hv_init.c78
-rw-r--r--arch/x86/hyperv/hv_vtl.c229
-rw-r--r--arch/x86/hyperv/ivm.c150
-rw-r--r--arch/x86/hyperv/mmu.c11
-rw-r--r--arch/x86/ia32/Makefile2
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/acpi.h8
-rw-r--r--arch/x86/include/asm/agp.h6
-rw-r--r--arch/x86/include/asm/alternative.h192
-rw-r--r--arch/x86/include/asm/apic.h3
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h44
-rw-r--r--arch/x86/include/asm/atomic64_64.h36
-rw-r--r--arch/x86/include/asm/bootparam_utils.h2
-rw-r--r--arch/x86/include/asm/cacheinfo.h13
-rw-r--r--arch/x86/include/asm/checksum_64.h1
-rw-r--r--arch/x86/include/asm/cmpxchg.h6
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h28
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h5
-rw-r--r--arch/x86/include/asm/coco.h24
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h4
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/cpufeatures.h35
-rw-r--r--arch/x86/include/asm/cpuid.h141
-rw-r--r--arch/x86/include/asm/current.h32
-rw-r--r--arch/x86/include/asm/debugreg.h37
-rw-r--r--arch/x86/include/asm/disabled-features.h28
-rw-r--r--arch/x86/include/asm/dma-mapping.h2
-rw-r--r--arch/x86/include/asm/efi.h123
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/entry-common.h4
-rw-r--r--arch/x86/include/asm/fpu/sched.h2
-rw-r--r--arch/x86/include/asm/fpu/signal.h7
-rw-r--r--arch/x86/include/asm/fpu/types.h2
-rw-r--r--arch/x86/include/asm/fpu/xcr.h4
-rw-r--r--arch/x86/include/asm/ftrace.h49
-rw-r--r--arch/x86/include/asm/gsseg.h66
-rw-r--r--arch/x86/include/asm/hardirq.h3
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h146
-rw-r--r--arch/x86/include/asm/hyperv_timer.h9
-rw-r--r--arch/x86/include/asm/ibt.h4
-rw-r--r--arch/x86/include/asm/idtentry.h16
-rw-r--r--arch/x86/include/asm/insn-eval.h18
-rw-r--r--arch/x86/include/asm/intel-family.h4
-rw-r--r--arch/x86/include/asm/intel-mid.h21
-rw-r--r--arch/x86/include/asm/irq_remapping.h4
-rw-r--r--arch/x86/include/asm/irq_stack.h12
-rw-r--r--arch/x86/include/asm/irqdomain.h4
-rw-r--r--arch/x86/include/asm/irqflags.h11
-rw-r--r--arch/x86/include/asm/kasan.h3
-rw-r--r--arch/x86/include/asm/kexec.h3
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h12
-rw-r--r--arch/x86/include/asm/kvm_host.h340
-rw-r--r--arch/x86/include/asm/kvmclock.h2
-rw-r--r--arch/x86/include/asm/linkage.h65
-rw-r--r--arch/x86/include/asm/local.h13
-rw-r--r--arch/x86/include/asm/mce.h3
-rw-r--r--arch/x86/include/asm/mem_encrypt.h1
-rw-r--r--arch/x86/include/asm/memtype.h5
-rw-r--r--arch/x86/include/asm/microcode.h8
-rw-r--r--arch/x86/include/asm/microcode_amd.h4
-rw-r--r--arch/x86/include/asm/microcode_intel.h5
-rw-r--r--arch/x86/include/asm/mmu.h18
-rw-r--r--arch/x86/include/asm/mmu_context.h61
-rw-r--r--arch/x86/include/asm/mshyperv.h113
-rw-r--r--arch/x86/include/asm/msi.h6
-rw-r--r--arch/x86/include/asm/msr-index.h63
-rw-r--r--arch/x86/include/asm/mtrr.h16
-rw-r--r--arch/x86/include/asm/mwait.h14
-rw-r--r--arch/x86/include/asm/nospec-branch.h194
-rw-r--r--arch/x86/include/asm/orc_header.h19
-rw-r--r--arch/x86/include/asm/orc_types.h16
-rw-r--r--arch/x86/include/asm/page.h5
-rw-r--r--arch/x86/include/asm/page_32.h4
-rw-r--r--arch/x86/include/asm/page_64.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h2
-rw-r--r--arch/x86/include/asm/page_types.h12
-rw-r--r--arch/x86/include/asm/paravirt.h39
-rw-r--r--arch/x86/include/asm/paravirt_types.h104
-rw-r--r--arch/x86/include/asm/pci.h5
-rw-r--r--arch/x86/include/asm/perf_event.h19
-rw-r--r--arch/x86/include/asm/pgtable-2level.h26
-rw-r--r--arch/x86/include/asm/pgtable-3level.h197
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h7
-rw-r--r--arch/x86/include/asm/pgtable.h56
-rw-r--r--arch/x86/include/asm/pgtable_32.h9
-rw-r--r--arch/x86/include/asm/pgtable_64.h1
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h3
-rw-r--r--arch/x86/include/asm/pgtable_areas.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/preempt.h27
-rw-r--r--arch/x86/include/asm/processor-flags.h4
-rw-r--r--arch/x86/include/asm/processor.h154
-rw-r--r--arch/x86/include/asm/pvclock.h3
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h47
-rw-r--r--arch/x86/include/asm/realmode.h2
-rw-r--r--arch/x86/include/asm/reboot.h3
-rw-r--r--arch/x86/include/asm/required-features.h3
-rw-r--r--arch/x86/include/asm/resctrl.h20
-rw-r--r--arch/x86/include/asm/segment.h10
-rw-r--r--arch/x86/include/asm/set_memory.h3
-rw-r--r--arch/x86/include/asm/setup.h6
-rw-r--r--arch/x86/include/asm/sev-common.h3
-rw-r--r--arch/x86/include/asm/sev.h10
-rw-r--r--arch/x86/include/asm/sgx.h33
-rw-r--r--arch/x86/include/asm/shared/io.h4
-rw-r--r--arch/x86/include/asm/shared/tdx.h12
-rw-r--r--arch/x86/include/asm/sighandling.h9
-rw-r--r--arch/x86/include/asm/signal.h5
-rw-r--r--arch/x86/include/asm/smp.h24
-rw-r--r--arch/x86/include/asm/special_insns.h29
-rw-r--r--arch/x86/include/asm/stackprotector.h14
-rw-r--r--arch/x86/include/asm/string_64.h42
-rw-r--r--arch/x86/include/asm/svm.h122
-rw-r--r--arch/x86/include/asm/switch_to.h7
-rw-r--r--arch/x86/include/asm/tdx.h2
-rw-r--r--arch/x86/include/asm/text-patching.h32
-rw-r--r--arch/x86/include/asm/thread_info.h5
-rw-r--r--arch/x86/include/asm/time.h1
-rw-r--r--arch/x86/include/asm/tlbflush.h48
-rw-r--r--arch/x86/include/asm/uaccess.h42
-rw-r--r--arch/x86/include/asm/uaccess_32.h3
-rw-r--r--arch/x86/include/asm/uaccess_64.h147
-rw-r--r--arch/x86/include/asm/unwind_hints.h26
-rw-r--r--arch/x86/include/asm/vdso.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h2
-rw-r--r--arch/x86/include/asm/vdso/processor.h4
-rw-r--r--arch/x86/include/asm/virtext.h16
-rw-r--r--arch/x86/include/asm/vmx.h2
-rw-r--r--arch/x86/include/asm/x86_init.h10
-rw-r--r--arch/x86/include/asm/xen/cpuid.h22
-rw-r--r--arch/x86/include/asm/xen/hypercall.h2
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h50
-rw-r--r--arch/x86/include/uapi/asm/prctl.h8
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h6
-rw-r--r--arch/x86/include/uapi/asm/svm.h6
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c49
-rw-r--r--arch/x86/kernel/acpi/cstate.c24
-rw-r--r--arch/x86/kernel/acpi/sleep.c23
-rw-r--r--arch/x86/kernel/alternative.c605
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c2
-rw-r--r--arch/x86/kernel/apic/apic.c18
-rw-r--r--arch/x86/kernel/apic/io_apic.c31
-rw-r--r--arch/x86/kernel/apic/msi.c211
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c126
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c20
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/callthunks.c388
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c81
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c9
-rw-r--r--arch/x86/kernel/cpu/bugs.c216
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c184
-rw-r--r--arch/x86/kernel/cpu/common.c193
-rw-r--r--arch/x86/kernel/cpu/cpu.h10
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/hygon.c6
-rw-r--r--arch/x86/kernel/cpu/intel.c268
-rw-r--r--arch/x86/kernel/cpu/intel_epb.c7
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c61
-rw-r--r--arch/x86/kernel/cpu/mce/core.c33
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c3
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h54
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c8
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c81
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c259
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c211
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c120
-rw-r--r--arch/x86/kernel/cpu/mtrr/amd.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/centaur.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c42
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c107
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c173
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c58
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c23
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h39
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c124
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c10
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c348
-rw-r--r--arch/x86/kernel/cpu/scattered.c3
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c39
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c9
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c19
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h2
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c5
-rw-r--r--arch/x86/kernel/cpu/tsx.c39
-rw-r--r--arch/x86/kernel/cpu/umwait.c8
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/crash.c21
-rw-r--r--arch/x86/kernel/crash_dump_64.c2
-rw-r--r--arch/x86/kernel/devicetree.c18
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/dumpstack_32.c4
-rw-r--r--arch/x86/kernel/dumpstack_64.c2
-rw-r--r--arch/x86/kernel/e820.c6
-rw-r--r--arch/x86/kernel/espfix_64.c12
-rw-r--r--arch/x86/kernel/fpu/context.h2
-rw-r--r--arch/x86/kernel/fpu/core.c27
-rw-r--r--arch/x86/kernel/fpu/init.c7
-rw-r--r--arch/x86/kernel/fpu/regset.c2
-rw-r--r--arch/x86/kernel/fpu/signal.c2
-rw-r--r--arch/x86/kernel/fpu/xstate.c94
-rw-r--r--arch/x86/kernel/fpu/xstate.h4
-rw-r--r--arch/x86/kernel/ftrace.c28
-rw-r--r--arch/x86/kernel/ftrace_32.S5
-rw-r--r--arch/x86/kernel/ftrace_64.S45
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c6
-rw-r--r--arch/x86/kernel/head_32.S22
-rw-r--r--arch/x86/kernel/head_64.S106
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c6
-rw-r--r--arch/x86/kernel/i8259.c3
-rw-r--r--arch/x86/kernel/irq_32.c13
-rw-r--r--arch/x86/kernel/irq_64.c6
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/itmt.c11
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c95
-rw-r--r--arch/x86/kernel/kprobes/opt.c34
-rw-r--r--arch/x86/kernel/kvm.c20
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/machine_kexec_64.c11
-rw-r--r--arch/x86/kernel/module.c157
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/nmi.c116
-rw-r--r--arch/x86/kernel/paravirt.c54
-rw-r--r--arch/x86/kernel/pci-dma.c2
-rw-r--r--arch/x86/kernel/process.c81
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c81
-rw-r--r--arch/x86/kernel/ptrace.c174
-rw-r--r--arch/x86/kernel/pvclock.c22
-rw-r--r--arch/x86/kernel/reboot.c90
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S15
-rw-r--r--arch/x86/kernel/resource.c12
-rw-r--r--arch/x86/kernel/rtc.c9
-rw-r--r--arch/x86/kernel/setup.c27
-rw-r--r--arch/x86/kernel/setup_percpu.c9
-rw-r--r--arch/x86/kernel/sev.c51
-rw-r--r--arch/x86/kernel/signal.c656
-rw-r--r--arch/x86/kernel/signal_32.c (renamed from arch/x86/ia32/ia32_signal.c)245
-rw-r--r--arch/x86/kernel/signal_64.c510
-rw-r--r--arch/x86/kernel/signal_compat.c191
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c54
-rw-r--r--arch/x86/kernel/static_call.c53
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/traps.c16
-rw-r--r--arch/x86/kernel/tsc.c77
-rw-r--r--arch/x86/kernel/unwind_orc.c56
-rw-r--r--arch/x86/kernel/uprobes.c4
-rw-r--r--arch/x86/kernel/vmlinux.lds.S38
-rw-r--r--arch/x86/kernel/x86_init.c9
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/cpuid.c173
-rw-r--r--arch/x86/kvm/debugfs.c2
-rw-r--r--arch/x86/kvm/emulate.c389
-rw-r--r--arch/x86/kvm/hyperv.c451
-rw-r--r--arch/x86/kvm/hyperv.h91
-rw-r--r--arch/x86/kvm/i8254.c4
-rw-r--r--arch/x86/kvm/i8259.c4
-rw-r--r--arch/x86/kvm/ioapic.c37
-rw-r--r--arch/x86/kvm/irq.c8
-rw-r--r--arch/x86/kvm/irq_comm.c12
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h35
-rw-r--r--arch/x86/kvm/kvm_emulate.h55
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c34
-rw-r--r--arch/x86/kvm/kvm_onhyperv.h10
-rw-r--r--arch/x86/kvm/lapic.c430
-rw-r--r--arch/x86/kvm/lapic.h10
-rw-r--r--arch/x86/kvm/mmu.h34
-rw-r--r--arch/x86/kvm/mmu/mmu.c1008
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h71
-rw-r--r--arch/x86/kvm/mmu/page_track.c1
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h304
-rw-r--r--arch/x86/kvm/mmu/spte.c22
-rw-r--r--arch/x86/kvm/mmu/spte.h41
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c12
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h48
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c479
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h27
-rw-r--r--arch/x86/kvm/mtrr.c1
-rw-r--r--arch/x86/kvm/pmu.c398
-rw-r--r--arch/x86/kvm/pmu.h48
-rw-r--r--arch/x86/kvm/reverse_cpuid.h36
-rw-r--r--arch/x86/kvm/smm.c648
-rw-r--r--arch/x86/kvm/smm.h168
-rw-r--r--arch/x86/kvm/svm/avic.c411
-rw-r--r--arch/x86/kvm/svm/hyperv.c18
-rw-r--r--arch/x86/kvm/svm/hyperv.h50
-rw-r--r--arch/x86/kvm/svm/nested.c164
-rw-r--r--arch/x86/kvm/svm/pmu.c8
-rw-r--r--arch/x86/kvm/svm/sev.c40
-rw-r--r--arch/x86/kvm/svm/svm.c445
-rw-r--r--arch/x86/kvm/svm/svm.h92
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c9
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h53
-rw-r--r--arch/x86/kvm/svm/vmenter.S1
-rw-r--r--arch/x86/kvm/trace.h36
-rw-r--r--arch/x86/kvm/vmx/capabilities.h28
-rw-r--r--arch/x86/kvm/vmx/hyperv.c (renamed from arch/x86/kvm/vmx/evmcs.c)239
-rw-r--r--arch/x86/kvm/vmx/hyperv.h (renamed from arch/x86/kvm/vmx/evmcs.h)101
-rw-r--r--arch/x86/kvm/vmx/nested.c305
-rw-r--r--arch/x86/kvm/vmx/nested.h7
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c168
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/sgx.c24
-rw-r--r--arch/x86/kvm/vmx/vmcs.h4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h5
-rw-r--r--arch/x86/kvm/vmx/vmenter.S84
-rw-r--r--arch/x86/kvm/vmx/vmx.c844
-rw-r--r--arch/x86/kvm/vmx/vmx.h38
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h48
-rw-r--r--arch/x86/kvm/x86.c1426
-rw-r--r--arch/x86/kvm/x86.h83
-rw-r--r--arch/x86/kvm/xen.c766
-rw-r--r--arch/x86/kvm/xen.h20
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/clear_page_64.S183
-rw-r--r--arch/x86/lib/cmdline.c4
-rw-r--r--arch/x86/lib/copy_user_64.S478
-rw-r--r--arch/x86/lib/copy_user_uncached_64.S242
-rw-r--r--arch/x86/lib/error-inject.c1
-rw-r--r--arch/x86/lib/getuser.S83
-rw-r--r--arch/x86/lib/insn-eval.c20
-rw-r--r--arch/x86/lib/iomap_copy_64.S2
-rw-r--r--arch/x86/lib/memcpy_32.c187
-rw-r--r--arch/x86/lib/memcpy_64.S39
-rw-r--r--arch/x86/lib/memmove_32.S200
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S51
-rw-r--r--arch/x86/lib/misc.c2
-rw-r--r--arch/x86/lib/putuser.S104
-rw-r--r--arch/x86/lib/retpoline.S113
-rw-r--r--arch/x86/lib/usercopy_64.c15
-rw-r--r--arch/x86/lib/x86-opcode-map.txt1
-rw-r--r--arch/x86/mm/cpu_entry_area.c65
-rw-r--r--arch/x86/mm/debug_pagetables.c1
-rw-r--r--arch/x86/mm/extable.c40
-rw-r--r--arch/x86/mm/fault.c59
-rw-r--r--arch/x86/mm/init.c36
-rw-r--r--arch/x86/mm/init_64.c133
-rw-r--r--arch/x86/mm/ioremap.c13
-rw-r--r--arch/x86/mm/kasan_init_64.c53
-rw-r--r--arch/x86/mm/kaslr.c8
-rw-r--r--arch/x86/mm/kmmio.c50
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c10
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S4
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c21
-rw-r--r--arch/x86/mm/pat/cpa-test.c4
-rw-r--r--arch/x86/mm/pat/memtype.c187
-rw-r--r--arch/x86/mm/pat/set_memory.c126
-rw-r--r--arch/x86/mm/pgtable.c22
-rw-r--r--arch/x86/mm/pti.c2
-rw-r--r--arch/x86/mm/tlb.c57
-rw-r--r--arch/x86/net/bpf_jit_comp.c355
-rw-r--r--arch/x86/pci/acpi.c39
-rw-r--r--arch/x86/pci/fixup.c80
-rw-r--r--arch/x86/pci/mmconfig-shared.c44
-rw-r--r--arch/x86/pci/xen.c10
-rw-r--r--arch/x86/platform/efi/Makefile5
-rw-r--r--arch/x86/platform/efi/efi.c56
-rw-r--r--arch/x86/platform/efi/efi_64.c8
-rw-r--r--arch/x86/platform/efi/fake_mem.c197
-rw-r--r--arch/x86/platform/efi/memmap.c239
-rw-r--r--arch/x86/platform/efi/runtime-map.c194
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c3
-rw-r--r--arch/x86/platform/pvh/enlighten.c2
-rw-r--r--arch/x86/platform/pvh/head.S2
-rw-r--r--arch/x86/platform/uv/uv_irq.c7
-rw-r--r--arch/x86/power/cpu.c27
-rw-r--r--arch/x86/power/hibernate.c2
-rw-r--r--arch/x86/purgatory/Makefile8
-rw-r--r--arch/x86/realmode/init.c8
-rw-r--r--arch/x86/tools/Makefile2
-rw-r--r--arch/x86/tools/relocs.c2
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/elf.h4
-rw-r--r--arch/x86/um/elfcore.c4
-rw-r--r--arch/x86/um/mem_32.c2
-rw-r--r--arch/x86/um/os-Linux/Makefile2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h8
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h8
-rw-r--r--arch/x86/um/stub_segv.c2
-rw-r--r--arch/x86/um/vdso/Makefile2
-rw-r--r--arch/x86/um/vdso/um_vdso.c12
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/enlighten_pv.c15
-rw-r--r--arch/x86/xen/enlighten_pvh.c13
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu_pv.c12
-rw-r--r--arch/x86/xen/p2m.c5
-rw-r--r--arch/x86/xen/setup.c7
-rw-r--r--arch/x86/xen/smp.c24
-rw-r--r--arch/x86/xen/smp.h2
-rw-r--r--arch/x86/xen/smp_pv.c29
-rw-r--r--arch/x86/xen/spinlock.c6
-rw-r--r--arch/x86/xen/time.c47
-rw-r--r--arch/x86/xen/vga.c5
-rw-r--r--arch/x86/xen/xen-asm.S12
-rw-r--r--arch/x86/xen/xen-head.S13
-rw-r--r--arch/x86/xen/xen-ops.h7
532 files changed, 23121 insertions, 12102 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 67745ceab0db..53bab123a8ee 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
# Options that are inherently 64-bit kernel only:
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
+ select ARCH_SUPPORTS_PER_VMA_LOCK
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
@@ -69,6 +70,7 @@ config X86
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE
@@ -81,6 +83,7 @@ config X86
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+ select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
@@ -123,8 +126,8 @@ config X86
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
- select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64
select ARCH_WANT_LD_ORPHAN_WARN
+ select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64
select ARCH_WANTS_THP_SWAP if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
@@ -157,9 +160,10 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
- select GUP_GET_PTE_LOW_HIGH if X86_PAE
+ select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
+ select HAS_IOPORT
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
@@ -195,6 +199,7 @@ config X86
select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER
select HAVE_C_RECORDMCOUNT
select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
+ select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT
select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
@@ -280,7 +285,6 @@ config X86
select RTC_LIB
select RTC_MC146818_LIB
select SPARSE_IRQ
- select SRCU
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select TRACE_IRQFLAGS_SUPPORT
@@ -290,6 +294,8 @@ config X86
select X86_FEATURE_NAMES if PROC_FS
select PROC_PID_ARCH_STATUS if PROC_FS
select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
+ select FUNCTION_ALIGNMENT_16B if X86_64 || X86_ALIGNMENT_16
+ select FUNCTION_ALIGNMENT_4B
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
@@ -357,11 +363,6 @@ config ARCH_HAS_CPU_RELAX
config ARCH_HIBERNATION_POSSIBLE
def_bool y
-config ARCH_NR_GPIO
- int
- default 1024 if X86_64
- default 512
-
config ARCH_SUSPEND_POSSIBLE
def_bool y
@@ -434,7 +435,7 @@ config SMP
Y to "Enhanced Real Time Clock Support", below. The "Advanced Power
Management" code will be disabled if you say Y here.
- See also <file:Documentation/x86/i386/IO-APIC.rst>,
+ See also <file:Documentation/arch/x86/i386/IO-APIC.rst>,
<file:Documentation/admin-guide/lockup-watchdogs.rst> and the SMP-HOWTO available at
<http://www.tldp.org/docs.html#howto>.
@@ -462,8 +463,8 @@ config X86_X2APIC
Some Intel systems circa 2022 and later are locked into x2APIC mode
and can not fall back to the legacy APIC modes if SGX or TDX are
- enabled in the BIOS. They will be unable to boot without enabling
- this option.
+ enabled in the BIOS. They will boot with very reduced functionality
+ without enabling this option.
If you don't know what to do here, say N.
@@ -1109,7 +1110,6 @@ config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
select IRQ_DOMAIN_HIERARCHY
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
config X86_IO_APIC
def_bool y
@@ -1325,7 +1325,7 @@ config MICROCODE
the Linux kernel.
The preferred method to load microcode from a detached initrd is described
- in Documentation/x86/microcode.rst. For that you need to enable
+ in Documentation/arch/x86/microcode.rst. For that you need to enable
CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the
initrd for microcode blobs.
@@ -1503,7 +1503,7 @@ config X86_5LEVEL
depends on X86_64
help
5-level paging enables access to larger address space:
- upto 128 PiB of virtual address space and 4 PiB of
+ up to 128 PiB of virtual address space and 4 PiB of
physical address space.
It will be supported by future Intel CPUs.
@@ -1511,7 +1511,7 @@ config X86_5LEVEL
A kernel with the option enabled can be booted on machines that
support 4- or 5-level paging.
- See Documentation/x86/x86_64/5level-paging.rst for more
+ See Documentation/arch/x86/x86_64/5level-paging.rst for more
information.
Say N if unsure.
@@ -1775,7 +1775,7 @@ config MTRR
You can safely say Y even if your machine doesn't have MTRRs, you'll
just add about 9 KB to your kernel.
- See <file:Documentation/x86/mtrr.rst> for more information.
+ See <file:Documentation/arch/x86/mtrr.rst> for more information.
config MTRR_SANITIZER
def_bool y
@@ -1854,7 +1854,7 @@ config CC_HAS_IBT
config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
- bool
+ def_bool y
depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL
# https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
depends on !LD_IS_LLD || LLD_VERSION >= 140000
@@ -1939,7 +1939,6 @@ config X86_SGX
depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
- select SRCU
select MMU_NOTIFIER
select NUMA_KEEP_MEMINFO if NUMA
select XARRAY_MULTI
@@ -1980,6 +1979,23 @@ config EFI_STUB
See Documentation/admin-guide/efi-stub.rst for more information.
+config EFI_HANDOVER_PROTOCOL
+ bool "EFI handover protocol (DEPRECATED)"
+ depends on EFI_STUB
+ default y
+ help
+ Select this in order to include support for the deprecated EFI
+ handover protocol, which defines alternative entry points into the
+ EFI stub. This is a practice that has no basis in the UEFI
+ specification, and requires a priori knowledge on the part of the
+ bootloader about Linux/x86 specific ways of passing the command line
+ and initrd, and where in memory those assets may be loaded.
+
+ If in doubt, say Y. Even though the corresponding support is not
+ present in upstream GRUB or other bootloaders, most distros build
+ GRUB with numerous downstream patches applied, and may rely on the
+ handover protocol as as result.
+
config EFI_MIXED
bool "EFI mixed-mode support"
depends on EFI_STUB && X86_64
@@ -1994,6 +2010,37 @@ config EFI_MIXED
If unsure, say N.
+config EFI_FAKE_MEMMAP
+ bool "Enable EFI fake memory map"
+ depends on EFI
+ help
+ Saying Y here will enable "efi_fake_mem" boot option. By specifying
+ this parameter, you can add arbitrary attribute to specific memory
+ range by updating original (firmware provided) EFI memmap. This is
+ useful for debugging of EFI memmap related feature, e.g., Address
+ Range Mirroring feature.
+
+config EFI_MAX_FAKE_MEM
+ int "maximum allowable number of ranges in efi_fake_mem boot option"
+ depends on EFI_FAKE_MEMMAP
+ range 1 128
+ default 8
+ help
+ Maximum allowable number of ranges in efi_fake_mem boot option.
+ Ranges can be set up to this value using comma-separated list.
+ The default value is 8.
+
+config EFI_RUNTIME_MAP
+ bool "Export EFI runtime maps to sysfs" if EXPERT
+ depends on EFI
+ default KEXEC_CORE
+ help
+ Export EFI runtime memory regions to /sys/firmware/efi/runtime-map.
+ That memory map is required by the 2nd kernel to set up EFI virtual
+ mappings after kexec, but can also be used for debugging purposes.
+
+ See also Documentation/ABI/testing/sysfs-firmware-efi-runtime-map.
+
source "kernel/Kconfig.hz"
config KEXEC
@@ -2243,6 +2290,17 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
If unsure, leave at the default value.
+config ADDRESS_MASKING
+ bool "Linear Address Masking support"
+ depends on X86_64
+ help
+ Linear Address Masking (LAM) modifies the checking that is applied
+ to 64-bit linear addresses, allowing software to use of the
+ untranslated address bits for metadata.
+
+ The capability can be used for efficient address sanitizers (ASAN)
+ implementation and for optimizations in JITs.
+
config HOTPLUG_CPU
def_bool y
depends on SMP
@@ -2443,6 +2501,46 @@ config CC_HAS_SLS
config CC_HAS_RETURN_THUNK
def_bool $(cc-option,-mfunction-return=thunk-extern)
+config CC_HAS_ENTRY_PADDING
+ def_bool $(cc-option,-fpatchable-function-entry=16,16)
+
+config FUNCTION_PADDING_CFI
+ int
+ default 59 if FUNCTION_ALIGNMENT_64B
+ default 27 if FUNCTION_ALIGNMENT_32B
+ default 11 if FUNCTION_ALIGNMENT_16B
+ default 3 if FUNCTION_ALIGNMENT_8B
+ default 0
+
+# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG
+# except Kconfig can't do arithmetic :/
+config FUNCTION_PADDING_BYTES
+ int
+ default FUNCTION_PADDING_CFI if CFI_CLANG
+ default FUNCTION_ALIGNMENT
+
+config CALL_PADDING
+ def_bool n
+ depends on CC_HAS_ENTRY_PADDING && OBJTOOL
+ select FUNCTION_ALIGNMENT_16B
+
+config FINEIBT
+ def_bool y
+ depends on X86_KERNEL_IBT && CFI_CLANG && RETPOLINE
+ select CALL_PADDING
+
+config HAVE_CALL_THUNKS
+ def_bool y
+ depends on CC_HAS_ENTRY_PADDING && RETHUNK && OBJTOOL
+
+config CALL_THUNKS
+ def_bool n
+ select CALL_PADDING
+
+config PREFIX_SYMBOLS
+ def_bool y
+ depends on CALL_PADDING && !CFI_CLANG
+
menuconfig SPECULATION_MITIGATIONS
bool "Mitigations for speculative execution vulnerabilities"
default y
@@ -2464,7 +2562,7 @@ config PAGE_TABLE_ISOLATION
ensuring that the majority of kernel addresses are not mapped
into userspace.
- See Documentation/x86/pti.rst for more details.
+ See Documentation/arch/x86/pti.rst for more details.
config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
@@ -2494,6 +2592,37 @@ config CPU_UNRET_ENTRY
help
Compile the kernel with support for the retbleed=unret mitigation.
+config CALL_DEPTH_TRACKING
+ bool "Mitigate RSB underflow with call depth tracking"
+ depends on CPU_SUP_INTEL && HAVE_CALL_THUNKS
+ select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+ select CALL_THUNKS
+ default y
+ help
+ Compile the kernel with call depth tracking to mitigate the Intel
+ SKL Return-Speculation-Buffer (RSB) underflow issue. The
+ mitigation is off by default and needs to be enabled on the
+ kernel command line via the retbleed=stuff option. For
+ non-affected systems the overhead of this option is marginal as
+ the call depth tracking is using run-time generated call thunks
+ in a compiler generated padding area and call patching. This
+ increases text size by ~5%. For non affected systems this space
+ is unused. On affected SKL systems this results in a significant
+ performance gain over the IBRS mitigation.
+
+config CALL_THUNKS_DEBUG
+ bool "Enable call thunks and call depth tracking debugging"
+ depends on CALL_DEPTH_TRACKING
+ select FUNCTION_ALIGNMENT_32B
+ default n
+ help
+ Enable call/ret counters for imbalance detection and build in
+ a noisy dmesg about callthunks generation and call patching for
+ trouble shooting. The debug prints need to be enabled on the
+ kernel command line with 'debug-callthunks'.
+ Only enable this when you are debugging call thunks as this
+ creates a noticeable runtime overhead. If unsure say N.
+
config CPU_IBPB_ENTRY
bool "Enable IBPB on kernel entry"
depends on CPU_SUP_AMD && X86_64
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 26b8c08e2fc4..b88f784cb02e 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -19,3 +19,8 @@ config AS_TPAUSE
def_bool $(as-instr,tpause %ecx)
help
Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
+
+config AS_GFNI
+ def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2)
+ help
+ Supported by binutils >= 2.30 and LLVM integrated assembler
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bdfe08f1a930..c5d614d28a75 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -97,7 +97,7 @@ config IOMMU_DEBUG
code. When you use it make sure you have a big enough
IOMMU/AGP aperture. Most of the options enabled by this can
be set more finegrained using the iommu= command line
- options. See Documentation/x86/x86_64/boot-options.rst for more
+ options. See Documentation/arch/x86/x86_64/boot-options.rst for more
details.
config IOMMU_LEAK
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 415a5d138de4..fdc2e3abd615 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -3,10 +3,10 @@
# select defconfig based on actual architecture
ifeq ($(ARCH),x86)
- ifeq ($(shell uname -m),x86_64)
- KBUILD_DEFCONFIG := x86_64_defconfig
- else
+ ifeq ($(shell uname -m | sed -e 's/i.86/i386/'),i386)
KBUILD_DEFCONFIG := i386_defconfig
+ else
+ KBUILD_DEFCONFIG := x86_64_defconfig
endif
else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
@@ -14,13 +14,13 @@ endif
ifdef CONFIG_CC_IS_GCC
RETPOLINE_CFLAGS := $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
-RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
RETPOLINE_VDSO_CFLAGS := $(call cc-option,-mindirect-branch=thunk-inline -mindirect-branch-register)
endif
ifdef CONFIG_CC_IS_CLANG
RETPOLINE_CFLAGS := -mretpoline-external-thunk
RETPOLINE_VDSO_CFLAGS := -mretpoline
endif
+RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch-cs-prefix)
ifdef CONFIG_RETHUNK
RETHUNK_CFLAGS := -mfunction-return=thunk-extern
@@ -208,10 +208,16 @@ ifdef CONFIG_SLS
KBUILD_CFLAGS += -mharden-sls=all
endif
+ifdef CONFIG_CALL_PADDING
+PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
+KBUILD_CFLAGS += $(PADDING_CFLAGS)
+export PADDING_CFLAGS
+endif
+
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
ifdef CONFIG_LTO_CLANG
-ifeq ($(shell test $(CONFIG_LLD_VERSION) -lt 130000; echo $$?),0)
+ifeq ($(call test-lt, $(CONFIG_LLD_VERSION), 130000),y)
KBUILD_LDFLAGS += -plugin-opt=-stack-alignment=$(if $(CONFIG_X86_32),4,8)
endif
endif
@@ -299,6 +305,18 @@ ifeq ($(RETPOLINE_CFLAGS),)
endif
endif
+ifdef CONFIG_UNWINDER_ORC
+orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h
+orc_hash_sh := $(srctree)/scripts/orc_hash.sh
+targets += $(orc_hash_h)
+quiet_cmd_orc_hash = GEN $@
+ cmd_orc_hash = mkdir -p $(dir $@); \
+ $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@
+$(orc_hash_h): $(srctree)/arch/x86/include/asm/orc_types.h $(orc_hash_sh) FORCE
+ $(call if_changed,orc_hash)
+archprepare: $(orc_hash_h)
+endif
+
archclean:
$(Q)rm -rf $(objtree)/arch/i386
$(Q)rm -rf $(objtree)/arch/x86_64
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index b3c1ae084180..2106a2bd152b 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -1,6 +1,17 @@
# SPDX-License-Identifier: GPL-2.0
core-y += arch/x86/crypto/
+#
+# Disable SSE and other FP/SIMD instructions to match normal x86
+# This is required to work around issues in older LLVM versions, but breaks
+# GCC versions < 11. See:
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652
+#
+ifeq ($(CONFIG_CC_IS_CLANG),y)
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+endif
+
ifeq ($(CONFIG_X86_32),y)
START := 0x8048000
@@ -17,7 +28,7 @@ LDS_EXTRA := -Ui386
export LDS_EXTRA
# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include arch/x86/Makefile_32.cpu
+include $(srctree)/arch/x86/Makefile_32.cpu
# prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 9860ca5979f8..9e38ffaadb5d 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -83,7 +83,7 @@ cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
$(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
$(call if_changed,image)
- @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')'
+ @$(kecho) 'Kernel: $@ is ready' ' (#'$(or $(KBUILD_BUILD_VERSION),`cat .version`)')'
OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 5521ea12f44e..aa9b96457584 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -32,7 +32,7 @@ intcall:
movw %dx, %si
movw %sp, %di
movw $11, %cx
- rep; movsd
+ rep; movsl
/* Pop full state from the stack */
popal
@@ -67,7 +67,7 @@ intcall:
jz 4f
movw %sp, %si
movw $11, %cx
- rep; movsd
+ rep; movsl
4: addw $44, %sp
/* Restore state and return */
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 3a261abb6d15..6b6cfe607bdb 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
# Disable relocation relaxation in case the link is not PIE.
-KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
# sev.c indirectly inludes inat-table.h which is generated during
@@ -68,7 +68,7 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info)
# address by the bootloader.
LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker)
ifdef CONFIG_LD_ORPHAN_WARN
-LDFLAGS_vmlinux += --orphan-handling=warn
+LDFLAGS_vmlinux += --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
endif
LDFLAGS_vmlinux += -z noexecstack
ifeq ($(CONFIG_LD_IS_BFD),y)
@@ -100,7 +100,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/ident_map_64.o
vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
- vmlinux-objs-y += $(obj)/mem_encrypt.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev.o
endif
@@ -108,11 +108,11 @@ endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o
-vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
vmlinux-objs-$(CONFIG_EFI) += $(obj)/efi.o
-efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_mixed.o
+vmlinux-objs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
-$(obj)/vmlinux: $(vmlinux-objs-y) $(efi-obj-y) FORCE
+$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
$(call if_changed,ld)
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S
new file mode 100644
index 000000000000..4ca70bf93dc0
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_mixed.S
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT and IDT before we make EFI service
+ * calls.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identity
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .code64
+ .text
+/*
+ * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
+ * is the first thing that runs after switching to long mode. Depending on
+ * whether the EFI handover protocol or the compat entry point was used to
+ * enter the kernel, it will either branch to the 64-bit EFI handover
+ * entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF
+ * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
+ * struct bootparams pointer as the third argument, so the presence of such a
+ * pointer is used to disambiguate.
+ *
+ * +--------------+
+ * +------------------+ +------------+ +------>| efi_pe_entry |
+ * | efi32_pe_entry |---->| | | +-----------+--+
+ * +------------------+ | | +------+----------------+ |
+ * | startup_32 |---->| startup_64_mixed_mode | |
+ * +------------------+ | | +------+----------------+ V
+ * | efi32_stub_entry |---->| | | +------------------+
+ * +------------------+ +------------+ +---->| efi64_stub_entry |
+ * +-------------+----+
+ * +------------+ +----------+ |
+ * | startup_64 |<----| efi_main |<--------------+
+ * +------------+ +----------+
+ */
+SYM_FUNC_START(startup_64_mixed_mode)
+ lea efi32_boot_args(%rip), %rdx
+ mov 0(%rdx), %edi
+ mov 4(%rdx), %esi
+ mov 8(%rdx), %edx // saved bootparams pointer
+ test %edx, %edx
+ jnz efi64_stub_entry
+ /*
+ * efi_pe_entry uses MS calling convention, which requires 32 bytes of
+ * shadow space on the stack even if all arguments are passed in
+ * registers. We also need an additional 8 bytes for the space that
+ * would be occupied by the return address, and this also results in
+ * the correct stack alignment for entry.
+ */
+ sub $40, %rsp
+ mov %rdi, %rcx // MS calling convention
+ mov %rsi, %rdx
+ jmp efi_pe_entry
+SYM_FUNC_END(startup_64_mixed_mode)
+
+SYM_FUNC_START(__efi64_thunk)
+ push %rbp
+ push %rbx
+
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ /* Copy args passed on stack */
+ movq 0x30(%rsp), %rbp
+ movq 0x38(%rsp), %rbx
+ movq 0x40(%rsp), %rax
+
+ /*
+ * Convert x86-64 ABI params to i386 ABI
+ */
+ subq $64, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movl %r8d, 0xc(%rsp)
+ movl %r9d, 0x10(%rsp)
+ movl %ebp, 0x14(%rsp)
+ movl %ebx, 0x18(%rsp)
+ movl %eax, 0x1c(%rsp)
+
+ leaq 0x20(%rsp), %rbx
+ sgdt (%rbx)
+ sidt 16(%rbx)
+
+ leaq 1f(%rip), %rbp
+
+ /*
+ * Switch to IDT and GDT with 32-bit segments. These are the firmware
+ * GDT and IDT that were installed when the kernel started executing.
+ * The pointers were saved by the efi32_entry() routine below.
+ *
+ * Pass the saved DS selector to the 32-bit code, and use far return to
+ * restore the saved CS selector.
+ */
+ lidt efi32_boot_idt(%rip)
+ lgdt efi32_boot_gdt(%rip)
+
+ movzwl efi32_boot_ds(%rip), %edx
+ movzwq efi32_boot_cs(%rip), %rax
+ pushq %rax
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $64, %rsp
+ movq %rdi, %rax
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+ /* Clear out 32-bit selector from FS and GS */
+ xorl %ebx, %ebx
+ movl %ebx, %fs
+ movl %ebx, %gs
+
+ pop %rbx
+ pop %rbp
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+SYM_FUNC_START_LOCAL(efi_enter32)
+ /* Load firmware selector into data and stack segment registers */
+ movl %edx, %ds
+ movl %edx, %es
+ movl %edx, %fs
+ movl %edx, %gs
+ movl %edx, %ss
+
+ /* Reload pgtables */
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ /*
+ * Some firmware will return with interrupts enabled. Be sure to
+ * disable them before we switch GDTs and IDTs.
+ */
+ cli
+
+ lidtl 16(%ebx)
+ lgdtl (%ebx)
+
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ xorl %eax, %eax
+ lldt %ax
+
+ pushl $__KERNEL_CS
+ pushl %ebp
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ lret
+SYM_FUNC_END(efi_enter32)
+
+/*
+ * This is the common EFI stub entry point for mixed mode.
+ *
+ * Arguments: %ecx image handle
+ * %edx EFI system table pointer
+ * %esi struct bootparams pointer (or NULL when not using
+ * the EFI handover protocol)
+ *
+ * Since this is the point of no return for ordinary execution, no registers
+ * are considered live except for the function parameters. [Note that the EFI
+ * stub may still exit and return to the firmware using the Exit() EFI boot
+ * service.]
+ */
+SYM_FUNC_START(efi32_entry)
+ call 1f
+1: pop %ebx
+
+ /* Save firmware GDTR and code/data selectors */
+ sgdtl (efi32_boot_gdt - 1b)(%ebx)
+ movw %cs, (efi32_boot_cs - 1b)(%ebx)
+ movw %ds, (efi32_boot_ds - 1b)(%ebx)
+
+ /* Store firmware IDT descriptor */
+ sidtl (efi32_boot_idt - 1b)(%ebx)
+
+ /* Store boot arguments */
+ leal (efi32_boot_args - 1b)(%ebx), %ebx
+ movl %ecx, 0(%ebx)
+ movl %edx, 4(%ebx)
+ movl %esi, 8(%ebx)
+ movb $0x0, 12(%ebx) // efi_is64
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ jmp startup_32
+SYM_FUNC_END(efi32_entry)
+
+#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
+#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
+#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
+
+/*
+ * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
+ * efi_system_table_32_t *sys_table)
+ */
+SYM_FUNC_START(efi32_pe_entry)
+ pushl %ebp
+ movl %esp, %ebp
+ pushl %eax // dummy push to allocate loaded_image
+
+ pushl %ebx // save callee-save registers
+ pushl %edi
+
+ call verify_cpu // check for long mode support
+ testl %eax, %eax
+ movl $0x80000003, %eax // EFI_UNSUPPORTED
+ jnz 2f
+
+ call 1f
+1: pop %ebx
+
+ /* Get the loaded image protocol pointer from the image handle */
+ leal -4(%ebp), %eax
+ pushl %eax // &loaded_image
+ leal (loaded_image_proto - 1b)(%ebx), %eax
+ pushl %eax // pass the GUID address
+ pushl 8(%ebp) // pass the image handle
+
+ /*
+ * Note the alignment of the stack frame.
+ * sys_table
+ * handle <-- 16-byte aligned on entry by ABI
+ * return address
+ * frame pointer
+ * loaded_image <-- local variable
+ * saved %ebx <-- 16-byte aligned here
+ * saved %edi
+ * &loaded_image
+ * &loaded_image_proto
+ * handle <-- 16-byte aligned for call to handle_protocol
+ */
+
+ movl 12(%ebp), %eax // sys_table
+ movl ST32_boottime(%eax), %eax // sys_table->boottime
+ call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
+ addl $12, %esp // restore argument space
+ testl %eax, %eax
+ jnz 2f
+
+ movl 8(%ebp), %ecx // image_handle
+ movl 12(%ebp), %edx // sys_table
+ movl -4(%ebp), %esi // loaded_image
+ movl LI32_image_base(%esi), %esi // loaded_image->image_base
+ leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
+ /*
+ * We need to set the image_offset variable here since startup_32() will
+ * use it before we get to the 64-bit efi_pe_entry() in C code.
+ */
+ subl %esi, %ebp // calculate image_offset
+ movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
+ xorl %esi, %esi
+ jmp efi32_entry // pass %ecx, %edx, %esi
+ // no other registers remain live
+
+2: popl %edi // restore callee-save registers
+ popl %ebx
+ leave
+ RET
+SYM_FUNC_END(efi32_pe_entry)
+
+ .section ".rodata"
+ /* EFI loaded image protocol GUID */
+ .balign 4
+SYM_DATA_START_LOCAL(loaded_image_proto)
+ .long 0x5b1b31a1
+ .word 0x9562, 0x11d2
+ .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
+SYM_DATA_END(loaded_image_proto)
+
+ .data
+ .balign 8
+SYM_DATA_START_LOCAL(efi32_boot_gdt)
+ .word 0
+ .quad 0
+SYM_DATA_END(efi32_boot_gdt)
+
+SYM_DATA_START_LOCAL(efi32_boot_idt)
+ .word 0
+ .quad 0
+SYM_DATA_END(efi32_boot_idt)
+
+SYM_DATA_LOCAL(efi32_boot_cs, .word 0)
+SYM_DATA_LOCAL(efi32_boot_ds, .word 0)
+SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
+SYM_DATA(efi_is64, .byte 1)
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
deleted file mode 100644
index 67e7edcdfea8..000000000000
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ /dev/null
@@ -1,195 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
- *
- * Early support for invoking 32-bit EFI services from a 64-bit kernel.
- *
- * Because this thunking occurs before ExitBootServices() we have to
- * restore the firmware's 32-bit GDT and IDT before we make EFI service
- * calls.
- *
- * On the plus side, we don't have to worry about mangling 64-bit
- * addresses into 32-bits because we're executing with an identity
- * mapped pagetable and haven't transitioned to 64-bit virtual addresses
- * yet.
- */
-
-#include <linux/linkage.h>
-#include <asm/msr.h>
-#include <asm/page_types.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
- .code64
- .text
-SYM_FUNC_START(__efi64_thunk)
- push %rbp
- push %rbx
-
- movl %ds, %eax
- push %rax
- movl %es, %eax
- push %rax
- movl %ss, %eax
- push %rax
-
- /* Copy args passed on stack */
- movq 0x30(%rsp), %rbp
- movq 0x38(%rsp), %rbx
- movq 0x40(%rsp), %rax
-
- /*
- * Convert x86-64 ABI params to i386 ABI
- */
- subq $64, %rsp
- movl %esi, 0x0(%rsp)
- movl %edx, 0x4(%rsp)
- movl %ecx, 0x8(%rsp)
- movl %r8d, 0xc(%rsp)
- movl %r9d, 0x10(%rsp)
- movl %ebp, 0x14(%rsp)
- movl %ebx, 0x18(%rsp)
- movl %eax, 0x1c(%rsp)
-
- leaq 0x20(%rsp), %rbx
- sgdt (%rbx)
-
- addq $16, %rbx
- sidt (%rbx)
-
- leaq 1f(%rip), %rbp
-
- /*
- * Switch to IDT and GDT with 32-bit segments. This is the firmware GDT
- * and IDT that was installed when the kernel started executing. The
- * pointers were saved at the EFI stub entry point in head_64.S.
- *
- * Pass the saved DS selector to the 32-bit code, and use far return to
- * restore the saved CS selector.
- */
- leaq efi32_boot_idt(%rip), %rax
- lidt (%rax)
- leaq efi32_boot_gdt(%rip), %rax
- lgdt (%rax)
-
- movzwl efi32_boot_ds(%rip), %edx
- movzwq efi32_boot_cs(%rip), %rax
- pushq %rax
- leaq efi_enter32(%rip), %rax
- pushq %rax
- lretq
-
-1: addq $64, %rsp
- movq %rdi, %rax
-
- pop %rbx
- movl %ebx, %ss
- pop %rbx
- movl %ebx, %es
- pop %rbx
- movl %ebx, %ds
- /* Clear out 32-bit selector from FS and GS */
- xorl %ebx, %ebx
- movl %ebx, %fs
- movl %ebx, %gs
-
- /*
- * Convert 32-bit status code into 64-bit.
- */
- roll $1, %eax
- rorq $1, %rax
-
- pop %rbx
- pop %rbp
- RET
-SYM_FUNC_END(__efi64_thunk)
-
- .code32
-/*
- * EFI service pointer must be in %edi.
- *
- * The stack should represent the 32-bit calling convention.
- */
-SYM_FUNC_START_LOCAL(efi_enter32)
- /* Load firmware selector into data and stack segment registers */
- movl %edx, %ds
- movl %edx, %es
- movl %edx, %fs
- movl %edx, %gs
- movl %edx, %ss
-
- /* Reload pgtables */
- movl %cr3, %eax
- movl %eax, %cr3
-
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /* Disable long mode via EFER */
- movl $MSR_EFER, %ecx
- rdmsr
- btrl $_EFER_LME, %eax
- wrmsr
-
- call *%edi
-
- /* We must preserve return value */
- movl %eax, %edi
-
- /*
- * Some firmware will return with interrupts enabled. Be sure to
- * disable them before we switch GDTs and IDTs.
- */
- cli
-
- lidtl (%ebx)
- subl $16, %ebx
-
- lgdtl (%ebx)
-
- movl %cr4, %eax
- btsl $(X86_CR4_PAE_BIT), %eax
- movl %eax, %cr4
-
- movl %cr3, %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_LME, %eax
- wrmsr
-
- xorl %eax, %eax
- lldt %ax
-
- pushl $__KERNEL_CS
- pushl %ebp
-
- /* Enable paging */
- movl %cr0, %eax
- btsl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
- lret
-SYM_FUNC_END(efi_enter32)
-
- .data
- .balign 8
-SYM_DATA_START(efi32_boot_gdt)
- .word 0
- .quad 0
-SYM_DATA_END(efi32_boot_gdt)
-
-SYM_DATA_START(efi32_boot_idt)
- .word 0
- .quad 0
-SYM_DATA_END(efi32_boot_idt)
-
-SYM_DATA_START(efi32_boot_cs)
- .word 0
-SYM_DATA_END(efi32_boot_cs)
-
-SYM_DATA_START(efi32_boot_ds)
- .word 0
-SYM_DATA_END(efi32_boot_ds)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3b354eb9516d..987ae727cf9f 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -187,7 +187,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
leal boot_heap@GOTOFF(%ebx), %eax
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
- call extract_kernel /* returns kernel location in %eax */
+ call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp
/*
@@ -208,10 +208,6 @@ SYM_DATA_START_LOCAL(gdt)
.quad 0x00cf92000000ffff /* __KERNEL_DS */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
-#ifdef CONFIG_EFI_STUB
-SYM_DATA(image_offset, .long 0)
-#endif
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d33f060900d2..03c4328a88cb 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -38,6 +38,14 @@
#include "pgtable.h"
/*
+ * Fix alignment at 16 bytes. Following CONFIG_FUNCTION_ALIGNMENT will result
+ * in assembly errors due to trying to move .org backward due to the excessive
+ * alignment.
+ */
+#undef __ALIGN
+#define __ALIGN .balign 16, 0x90
+
+/*
* Locally defined symbols should be marked hidden:
*/
.hidden _bss
@@ -118,7 +126,9 @@ SYM_FUNC_START(startup_32)
1:
/* Setup Exception handling for SEV-ES */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
call startup32_load_idt
+#endif
/* Make sure cpu supports long mode. */
call verify_cpu
@@ -178,12 +188,13 @@ SYM_FUNC_START(startup_32)
*/
/*
* If SEV is active then set the encryption mask in the page tables.
- * This will insure that when the kernel is copied and decompressed
+ * This will ensure that when the kernel is copied and decompressed
* it will be done so encrypted.
*/
- call get_sev_encryption_bit
xorl %edx, %edx
#ifdef CONFIG_AMD_MEM_ENCRYPT
+ call get_sev_encryption_bit
+ xorl %edx, %edx
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
@@ -249,6 +260,11 @@ SYM_FUNC_START(startup_32)
movl $__BOOT_TSS, %eax
ltr %ax
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+#endif
+
/*
* Setup for the jump to 64bit mode
*
@@ -261,29 +277,11 @@ SYM_FUNC_START(startup_32)
*/
leal rva(startup_64)(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
- movl rva(efi32_boot_args)(%ebp), %edi
- testl %edi, %edi
- jz 1f
- leal rva(efi64_stub_entry)(%ebp), %eax
- movl rva(efi32_boot_args+4)(%ebp), %esi
- movl rva(efi32_boot_args+8)(%ebp), %edx // saved bootparams pointer
- testl %edx, %edx
- jnz 1f
- /*
- * efi_pe_entry uses MS calling convention, which requires 32 bytes of
- * shadow space on the stack even if all arguments are passed in
- * registers. We also need an additional 8 bytes for the space that
- * would be occupied by the return address, and this also results in
- * the correct stack alignment for entry.
- */
- subl $40, %esp
- leal rva(efi_pe_entry)(%ebp), %eax
- movl %edi, %ecx // MS calling convention
- movl %esi, %edx
+ cmpb $1, rva(efi_is64)(%ebp)
+ je 1f
+ leal rva(startup_64_mixed_mode)(%ebp), %eax
1:
#endif
- /* Check if the C-bit position is correct when SEV is active */
- call startup32_check_sev_cbit
pushl $__KERNEL_CS
pushl %eax
@@ -296,38 +294,14 @@ SYM_FUNC_START(startup_32)
lret
SYM_FUNC_END(startup_32)
-#ifdef CONFIG_EFI_MIXED
+#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
.org 0x190
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
-
- call 1f
-1: pop %ebp
- subl $ rva(1b), %ebp
-
- movl %esi, rva(efi32_boot_args+8)(%ebp)
-SYM_INNER_LABEL(efi32_pe_stub_entry, SYM_L_LOCAL)
- movl %ecx, rva(efi32_boot_args)(%ebp)
- movl %edx, rva(efi32_boot_args+4)(%ebp)
- movb $0, rva(efi_is64)(%ebp)
-
- /* Save firmware GDTR and code/data selectors */
- sgdtl rva(efi32_boot_gdt)(%ebp)
- movw %cs, rva(efi32_boot_cs)(%ebp)
- movw %ds, rva(efi32_boot_ds)(%ebp)
-
- /* Store firmware IDT descriptor */
- sidtl rva(efi32_boot_idt)(%ebp)
-
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- jmp startup_32
+ jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
@@ -550,7 +524,9 @@ trampoline_return:
SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
.org 0x390
+#endif
SYM_FUNC_START(efi64_stub_entry)
and $~0xf, %rsp /* realign the stack */
movq %rdx, %rbx /* save boot_params pointer */
@@ -593,7 +569,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
movl input_len(%rip), %ecx /* input_len */
movq %rbp, %r8 /* output target address */
movl output_len(%rip), %r9d /* decompressed length, end of relocs */
- call extract_kernel /* returns kernel location in %rax */
+ call extract_kernel /* returns kernel entry point in %rax */
popq %rsi
/*
@@ -713,6 +689,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
jmp 1b
SYM_FUNC_END(.Lno_longmode)
+ .globl verify_cpu
#include "../../kernel/verify_cpu.S"
.data
@@ -744,242 +721,6 @@ SYM_DATA_START(boot_idt)
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
-SYM_DATA_START(boot32_idt_desc)
- .word boot32_idt_end - boot32_idt - 1
- .long 0
-SYM_DATA_END(boot32_idt_desc)
- .balign 8
-SYM_DATA_START(boot32_idt)
- .rept 32
- .quad 0
- .endr
-SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
-#endif
-
-#ifdef CONFIG_EFI_STUB
-SYM_DATA(image_offset, .long 0)
-#endif
-#ifdef CONFIG_EFI_MIXED
-SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
-SYM_DATA(efi_is64, .byte 1)
-
-#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
-#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
-#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
-
- __HEAD
- .code32
-SYM_FUNC_START(efi32_pe_entry)
-/*
- * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
- * efi_system_table_32_t *sys_table)
- */
-
- pushl %ebp
- movl %esp, %ebp
- pushl %eax // dummy push to allocate loaded_image
-
- pushl %ebx // save callee-save registers
- pushl %edi
-
- call verify_cpu // check for long mode support
- testl %eax, %eax
- movl $0x80000003, %eax // EFI_UNSUPPORTED
- jnz 2f
-
- call 1f
-1: pop %ebx
- subl $ rva(1b), %ebx
-
- /* Get the loaded image protocol pointer from the image handle */
- leal -4(%ebp), %eax
- pushl %eax // &loaded_image
- leal rva(loaded_image_proto)(%ebx), %eax
- pushl %eax // pass the GUID address
- pushl 8(%ebp) // pass the image handle
-
- /*
- * Note the alignment of the stack frame.
- * sys_table
- * handle <-- 16-byte aligned on entry by ABI
- * return address
- * frame pointer
- * loaded_image <-- local variable
- * saved %ebx <-- 16-byte aligned here
- * saved %edi
- * &loaded_image
- * &loaded_image_proto
- * handle <-- 16-byte aligned for call to handle_protocol
- */
-
- movl 12(%ebp), %eax // sys_table
- movl ST32_boottime(%eax), %eax // sys_table->boottime
- call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
- addl $12, %esp // restore argument space
- testl %eax, %eax
- jnz 2f
-
- movl 8(%ebp), %ecx // image_handle
- movl 12(%ebp), %edx // sys_table
- movl -4(%ebp), %esi // loaded_image
- movl LI32_image_base(%esi), %esi // loaded_image->image_base
- movl %ebx, %ebp // startup_32 for efi32_pe_stub_entry
- /*
- * We need to set the image_offset variable here since startup_32() will
- * use it before we get to the 64-bit efi_pe_entry() in C code.
- */
- subl %esi, %ebx
- movl %ebx, rva(image_offset)(%ebp) // save image_offset
- jmp efi32_pe_stub_entry
-
-2: popl %edi // restore callee-save registers
- popl %ebx
- leave
- RET
-SYM_FUNC_END(efi32_pe_entry)
-
- .section ".rodata"
- /* EFI loaded image protocol GUID */
- .balign 4
-SYM_DATA_START_LOCAL(loaded_image_proto)
- .long 0x5b1b31a1
- .word 0x9562, 0x11d2
- .byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
-SYM_DATA_END(loaded_image_proto)
-#endif
-
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- __HEAD
- .code32
-/*
- * Write an IDT entry into boot32_idt
- *
- * Parameters:
- *
- * %eax: Handler address
- * %edx: Vector number
- *
- * Physical offset is expected in %ebp
- */
-SYM_FUNC_START(startup32_set_idt_entry)
- push %ebx
- push %ecx
-
- /* IDT entry address to %ebx */
- leal rva(boot32_idt)(%ebp), %ebx
- shl $3, %edx
- addl %edx, %ebx
-
- /* Build IDT entry, lower 4 bytes */
- movl %eax, %edx
- andl $0x0000ffff, %edx # Target code segment offset [15:0]
- movl $__KERNEL32_CS, %ecx # Target code segment selector
- shl $16, %ecx
- orl %ecx, %edx
-
- /* Store lower 4 bytes to IDT */
- movl %edx, (%ebx)
-
- /* Build IDT entry, upper 4 bytes */
- movl %eax, %edx
- andl $0xffff0000, %edx # Target code segment offset [31:16]
- orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
-
- /* Store upper 4 bytes to IDT */
- movl %edx, 4(%ebx)
-
- pop %ecx
- pop %ebx
- RET
-SYM_FUNC_END(startup32_set_idt_entry)
-#endif
-
-SYM_FUNC_START(startup32_load_idt)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- /* #VC handler */
- leal rva(startup32_vc_handler)(%ebp), %eax
- movl $X86_TRAP_VC, %edx
- call startup32_set_idt_entry
-
- /* Load IDT */
- leal rva(boot32_idt)(%ebp), %eax
- movl %eax, rva(boot32_idt_desc+2)(%ebp)
- lidt rva(boot32_idt_desc)(%ebp)
-#endif
- RET
-SYM_FUNC_END(startup32_load_idt)
-
-/*
- * Check for the correct C-bit position when the startup_32 boot-path is used.
- *
- * The check makes use of the fact that all memory is encrypted when paging is
- * disabled. The function creates 64 bits of random data using the RDRAND
- * instruction. RDRAND is mandatory for SEV guests, so always available. If the
- * hypervisor violates that the kernel will crash right here.
- *
- * The 64 bits of random data are stored to a memory location and at the same
- * time kept in the %eax and %ebx registers. Since encryption is always active
- * when paging is off the random data will be stored encrypted in main memory.
- *
- * Then paging is enabled. When the C-bit position is correct all memory is
- * still mapped encrypted and comparing the register values with memory will
- * succeed. An incorrect C-bit position will map all memory unencrypted, so that
- * the compare will use the encrypted random data and fail.
- */
-SYM_FUNC_START(startup32_check_sev_cbit)
-#ifdef CONFIG_AMD_MEM_ENCRYPT
- pushl %eax
- pushl %ebx
- pushl %ecx
- pushl %edx
-
- /* Check for non-zero sev_status */
- movl rva(sev_status)(%ebp), %eax
- testl %eax, %eax
- jz 4f
-
- /*
- * Get two 32-bit random values - Don't bail out if RDRAND fails
- * because it is better to prevent forward progress if no random value
- * can be gathered.
- */
-1: rdrand %eax
- jnc 1b
-2: rdrand %ebx
- jnc 2b
-
- /* Store to memory and keep it in the registers */
- movl %eax, rva(sev_check_data)(%ebp)
- movl %ebx, rva(sev_check_data+4)(%ebp)
-
- /* Enable paging to see if encryption is active */
- movl %cr0, %edx /* Backup %cr0 in %edx */
- movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
- movl %ecx, %cr0
-
- cmpl %eax, rva(sev_check_data)(%ebp)
- jne 3f
- cmpl %ebx, rva(sev_check_data+4)(%ebp)
- jne 3f
-
- movl %edx, %cr0 /* Restore previous %cr0 */
-
- jmp 4f
-
-3: /* Check failed - hlt the machine */
- hlt
- jmp 3b
-
-4:
- popl %edx
- popl %ecx
- popl %ebx
- popl %eax
-#endif
- RET
-SYM_FUNC_END(startup32_check_sev_cbit)
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
index d4a314cc50d6..bcc956c17872 100644
--- a/arch/x86/boot/compressed/ident_map_64.c
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -8,14 +8,6 @@
* Copyright (C) 2016 Kees Cook
*/
-/*
- * Since we're dealing with identity mappings, physical and virtual
- * addresses are the same, so override these defines which are ultimately
- * used by the headers in misc.h.
- */
-#define __pa(x) ((unsigned long)(x))
-#define __va(x) ((void *)((unsigned long)(x)))
-
/* No PAGE_TABLE_ISOLATION support needed either: */
#undef CONFIG_PAGE_TABLE_ISOLATION
@@ -180,6 +172,12 @@ void initialize_identity_maps(void *rmode)
/* Load the new page-table. */
write_cr3(top_level_pgt);
+
+ /*
+ * Now that the required page table mappings are established and a
+ * GHCB can be used, check for SNP guest/HV feature compatibility.
+ */
+ snp_check_features();
}
static pte_t *split_large_pmd(struct x86_mapping_info *info,
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index e476bcbd9b42..454757fbdfe5 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -668,7 +668,7 @@ static bool process_mem_region(struct mem_vector *region,
}
}
#endif
- return 0;
+ return false;
}
#ifdef CONFIG_EFI
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index a73e4d783cae..32f7cc8a8625 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -12,16 +12,13 @@
#include <asm/processor-flags.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
.text
.code32
SYM_FUNC_START(get_sev_encryption_bit)
- xor %eax, %eax
-
-#ifdef CONFIG_AMD_MEM_ENCRYPT
push %ebx
- push %ecx
- push %edx
movl $0x80000000, %eax /* CPUID to check the highest leaf */
cpuid
@@ -52,12 +49,7 @@ SYM_FUNC_START(get_sev_encryption_bit)
xor %eax, %eax
.Lsev_exit:
- pop %edx
- pop %ecx
pop %ebx
-
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
-
RET
SYM_FUNC_END(get_sev_encryption_bit)
@@ -98,7 +90,7 @@ SYM_CODE_START_LOCAL(sev_es_req_cpuid)
jmp 1b
SYM_CODE_END(sev_es_req_cpuid)
-SYM_CODE_START(startup32_vc_handler)
+SYM_CODE_START_LOCAL(startup32_vc_handler)
pushl %eax
pushl %ebx
pushl %ecx
@@ -184,15 +176,149 @@ SYM_CODE_START(startup32_vc_handler)
jmp .Lfail
SYM_CODE_END(startup32_vc_handler)
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ * %ecx: IDT address
+ */
+SYM_FUNC_START_LOCAL(startup32_set_idt_entry)
+ /* IDT entry address to %ecx */
+ leal (%ecx, %edx, 8), %ecx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ orl $(__KERNEL32_CS << 16), %edx # Target code segment selector
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ecx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ecx)
+
+ RET
+SYM_FUNC_END(startup32_set_idt_entry)
+
+SYM_FUNC_START(startup32_load_idt)
+ push %ebp
+ push %ebx
+
+ call 1f
+1: pop %ebp
+
+ leal (boot32_idt - 1b)(%ebp), %ebx
+
+ /* #VC handler */
+ leal (startup32_vc_handler - 1b)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ movl %ebx, %ecx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal (boot32_idt_desc - 1b)(%ebp), %ecx
+ movl %ebx, 2(%ecx)
+ lidt (%ecx)
+
+ pop %ebx
+ pop %ebp
+ RET
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+ pushl %ebx
+ pushl %ebp
+
+ call 0f
+0: popl %ebp
+
+ /* Check for non-zero sev_status */
+ movl (sev_status - 0b)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ leal (sev_check_data - 0b)(%ebp), %ebp
+ movl %eax, 0(%ebp)
+ movl %ebx, 4(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, 0(%ebp)
+ jne 3f
+ cmpl %ebx, 4(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %ebp
+ popl %ebx
+ RET
+SYM_FUNC_END(startup32_check_sev_cbit)
+
.code64
#include "../../kernel/sev_verify_cbit.S"
.data
-#ifdef CONFIG_AMD_MEM_ENCRYPT
.balign 8
SYM_DATA(sme_me_mask, .quad 0)
SYM_DATA(sev_status, .quad 0)
SYM_DATA(sev_check_data, .quad 0)
-#endif
+
+SYM_DATA_START_LOCAL(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END(boot32_idt)
+
+SYM_DATA_START_LOCAL(boot32_idt_desc)
+ .word . - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index cf690d8712f4..014ff222bf4b 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -277,7 +277,7 @@ static inline void handle_relocations(void *output, unsigned long output_len,
{ }
#endif
-static void parse_elf(void *output)
+static size_t parse_elf(void *output)
{
#ifdef CONFIG_X86_64
Elf64_Ehdr ehdr;
@@ -293,10 +293,8 @@ static void parse_elf(void *output)
if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
- ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ ehdr.e_ident[EI_MAG3] != ELFMAG3)
error("Kernel is not a valid ELF file");
- return;
- }
debug_putstr("Parsing ELF... ");
@@ -328,6 +326,8 @@ static void parse_elf(void *output)
}
free(phdrs);
+
+ return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
/*
@@ -356,6 +356,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
unsigned long needed_size;
+ size_t entry_offset;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -456,14 +457,17 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
debug_putstr("\nDecompressing Linux... ");
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
- parse_elf(output);
+ entry_offset = parse_elf(output);
handle_relocations(output, output_len, virt_addr);
- debug_putstr("done.\nBooting the kernel.\n");
+
+ debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
+ debug_puthex(entry_offset);
+ debug_putstr(").\n");
/* Disable exception handling before booting the kernel */
cleanup_exception_handling();
- return output;
+ return output + entry_offset;
}
void fortify_panic(const char *name)
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 62208ec04ca4..2f155a0e3041 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -19,6 +19,15 @@
/* cpu_feature_enabled() cannot be used this early */
#define USE_EARLY_PGTABLE_L5
+/*
+ * Boot stub deals with identity mappings, physical and virtual addresses are
+ * the same, so override these defines.
+ *
+ * <asm/page.h> will not define them if they are already defined.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
#include <linux/linkage.h>
#include <linux/screen_info.h>
#include <linux/elf.h>
@@ -126,6 +135,7 @@ static inline void console_init(void)
#ifdef CONFIG_AMD_MEM_ENCRYPT
void sev_enable(struct boot_params *bp);
+void snp_check_features(void);
void sev_es_shutdown_ghcb(void);
extern bool sev_es_check_ghcb_fault(unsigned long address);
void snp_set_page_private(unsigned long paddr);
@@ -143,6 +153,7 @@ static inline void sev_enable(struct boot_params *bp)
if (bp)
bp->cc_blob_address = 0;
}
+static inline void snp_check_features(void) { }
static inline void sev_es_shutdown_ghcb(void) { }
static inline bool sev_es_check_ghcb_fault(unsigned long address)
{
diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c
index c93930d5ccbd..014b89c89088 100644
--- a/arch/x86/boot/compressed/sev.c
+++ b/arch/x86/boot/compressed/sev.c
@@ -104,9 +104,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
}
#undef __init
-#undef __pa
#define __init
-#define __pa(x) ((unsigned long)(x))
#define __BOOT_COMPRESSED
@@ -208,6 +206,23 @@ void sev_es_shutdown_ghcb(void)
error("Can't unmap GHCB page");
}
+static void __noreturn sev_es_ghcb_terminate(struct ghcb *ghcb, unsigned int set,
+ unsigned int reason, u64 exit_info_2)
+{
+ u64 exit_info_1 = SVM_VMGEXIT_TERM_REASON(set, reason);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_TERM_REQUEST);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
bool sev_es_check_ghcb_fault(unsigned long address)
{
/* Check whether the fault was on the GHCB page */
@@ -270,6 +285,59 @@ static void enforce_vmpl0(void)
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0);
}
+/*
+ * SNP_FEATURES_IMPL_REQ is the mask of SNP features that will need
+ * guest side implementation for proper functioning of the guest. If any
+ * of these features are enabled in the hypervisor but are lacking guest
+ * side implementation, the behavior of the guest will be undefined. The
+ * guest could fail in non-obvious way making it difficult to debug.
+ *
+ * As the behavior of reserved feature bits is unknown to be on the
+ * safe side add them to the required features mask.
+ */
+#define SNP_FEATURES_IMPL_REQ (MSR_AMD64_SNP_VTOM | \
+ MSR_AMD64_SNP_REFLECT_VC | \
+ MSR_AMD64_SNP_RESTRICTED_INJ | \
+ MSR_AMD64_SNP_ALT_INJ | \
+ MSR_AMD64_SNP_DEBUG_SWAP | \
+ MSR_AMD64_SNP_VMPL_SSS | \
+ MSR_AMD64_SNP_SECURE_TSC | \
+ MSR_AMD64_SNP_VMGEXIT_PARAM | \
+ MSR_AMD64_SNP_VMSA_REG_PROTECTION | \
+ MSR_AMD64_SNP_RESERVED_BIT13 | \
+ MSR_AMD64_SNP_RESERVED_BIT15 | \
+ MSR_AMD64_SNP_RESERVED_MASK)
+
+/*
+ * SNP_FEATURES_PRESENT is the mask of SNP features that are implemented
+ * by the guest kernel. As and when a new feature is implemented in the
+ * guest kernel, a corresponding bit should be added to the mask.
+ */
+#define SNP_FEATURES_PRESENT (0)
+
+void snp_check_features(void)
+{
+ u64 unsupported;
+
+ if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
+ return;
+
+ /*
+ * Terminate the boot if hypervisor has enabled any feature lacking
+ * guest side implementation. Pass on the unsupported features mask through
+ * EXIT_INFO_2 of the GHCB protocol so that those features can be reported
+ * as part of the guest boot failure.
+ */
+ unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
+ if (unsupported) {
+ if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
+ sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
+
+ sev_es_ghcb_terminate(boot_ghcb, SEV_TERM_SET_GEN,
+ GHCB_SNP_UNSUPPORTED, unsupported);
+ }
+}
+
void sev_enable(struct boot_params *bp)
{
unsigned int eax, ebx, ecx, edx;
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
index 918a7606f53c..2d81d3cc72a1 100644
--- a/arch/x86/boot/compressed/tdx.c
+++ b/arch/x86/boot/compressed/tdx.c
@@ -26,7 +26,7 @@ static inline unsigned int tdx_io_in(int size, u16 port)
.r14 = port,
};
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return UINT_MAX;
return args.r11;
@@ -43,7 +43,7 @@ static inline void tdx_io_out(int size, u16 port, u32 value)
.r15 = value,
};
- __tdx_hypercall(&args, 0);
+ __tdx_hypercall(&args);
}
static inline u8 tdx_inb(u16 port)
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 112b2375d021..b22f34b8684a 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -34,6 +34,7 @@ SECTIONS
_text = .; /* Text */
*(.text)
*(.text.*)
+ *(.noinstr.text)
_etext = . ;
}
.rodata : {
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index a83d67ec627d..d75237ba7ce9 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -64,20 +64,11 @@ int has_eflag(unsigned long mask)
return !!((f0^f1) & mask);
}
-/* Handle x86_32 PIC using ebx. */
-#if defined(__i386__) && defined(__PIC__)
-# define EBX_REG "=r"
-#else
-# define EBX_REG "=b"
-#endif
-
void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d)
{
- asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
- "cpuid \n\t"
- ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
- : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
- : "a" (id), "c" (count)
+ asm volatile("cpuid"
+ : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+ : "0" (id), "2" (count)
);
}
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index f912d7770130..b04ca8e2b213 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -80,10 +80,11 @@ bs_die:
ljmp $0xf000,$0xfff0
#ifdef CONFIG_EFI_STUB
- .org 0x3c
+ .org 0x38
#
# Offset to the PE header.
#
+ .long LINUX_PE_MAGIC
.long pe_header
#endif /* CONFIG_EFI_STUB */
@@ -320,7 +321,7 @@ start_sys_seg: .word SYSSEG # obsolete and meaningless, but just
type_of_loader: .byte 0 # 0 means ancient bootloader, newer
# bootloaders know to change this.
- # See Documentation/x86/boot.rst for
+ # See Documentation/arch/x86/boot.rst for
# assigned ids
# flags, unused bits must be zero (RFU) bit within loadflags
@@ -406,7 +407,7 @@ xloadflags:
# define XLF1 0
#endif
-#ifdef CONFIG_EFI_STUB
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
# ifdef CONFIG_EFI_MIXED
# define XLF23 (XLF_EFI_HANDOVER_32|XLF_EFI_HANDOVER_64)
# else
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 8a3fff9128bb..1c8541ae3b3a 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -350,7 +350,7 @@ static int _kstrtoul(const char *s, unsigned int base, unsigned long *res)
}
/**
- * kstrtoul - convert a string to an unsigned long
+ * boot_kstrtoul - convert a string to an unsigned long
* @s: The start of the string. The string must be null-terminated, and may also
* include a single newline before its terminating null. The first character
* may also be a plus sign, but not a minus sign.
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index a3725ad46c5a..bd247692b701 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -290,6 +290,7 @@ static void efi_stub_entry_update(void)
{
unsigned long addr = efi32_stub_entry;
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
#ifdef CONFIG_X86_64
/* Yes, this is really how we defined it :( */
addr = efi64_stub_entry - 0x200;
@@ -299,6 +300,7 @@ static void efi_stub_entry_update(void)
if (efi32_stub_entry != addr)
die("32-bit and 64-bit EFI entry points do not match\n");
#endif
+#endif
put_unaligned_le32(addr, &buf[0x264]);
}
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 49b44f881484..73f83233d25d 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -13,7 +13,7 @@
#include <asm/coco.h>
#include <asm/processor.h>
-static enum cc_vendor vendor __ro_after_init;
+enum cc_vendor cc_vendor __ro_after_init;
static u64 cc_mask __ro_after_init;
static bool intel_cc_platform_has(enum cc_attr attr)
@@ -30,6 +30,22 @@ static bool intel_cc_platform_has(enum cc_attr attr)
}
/*
+ * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and
+ * the other levels of SME/SEV functionality, including C-bit
+ * based SEV-SNP, are not enabled.
+ */
+static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
* SME and SEV are very similar but they are not the same, so there are
* times that the kernel will need to distinguish between SME and SEV. The
* cc_platform_has() function is used for this. When a distinction isn't
@@ -41,9 +57,14 @@ static bool intel_cc_platform_has(enum cc_attr attr)
* up under SME the trampoline area cannot be encrypted, whereas under SEV
* the trampoline area must be encrypted.
*/
+
static bool amd_cc_platform_has(enum cc_attr attr)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return amd_cc_platform_vtom(attr);
+
switch (attr) {
case CC_ATTR_MEM_ENCRYPT:
return sme_me_mask;
@@ -76,20 +97,13 @@ static bool amd_cc_platform_has(enum cc_attr attr)
#endif
}
-static bool hyperv_cc_platform_has(enum cc_attr attr)
-{
- return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
-}
-
bool cc_platform_has(enum cc_attr attr)
{
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
return amd_cc_platform_has(attr);
case CC_VENDOR_INTEL:
return intel_cc_platform_has(attr);
- case CC_VENDOR_HYPERV:
- return hyperv_cc_platform_has(attr);
default:
return false;
}
@@ -103,11 +117,14 @@ u64 cc_mkenc(u64 val)
* encryption status of the page.
*
* - for AMD, bit *set* means the page is encrypted
- * - for Intel *clear* means encrypted.
+ * - for AMD with vTOM and for Intel, *clear* means encrypted
*/
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
- return val | cc_mask;
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val & ~cc_mask;
+ else
+ return val | cc_mask;
case CC_VENDOR_INTEL:
return val & ~cc_mask;
default:
@@ -118,9 +135,12 @@ u64 cc_mkenc(u64 val)
u64 cc_mkdec(u64 val)
{
/* See comment in cc_mkenc() */
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
- return val & ~cc_mask;
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val | cc_mask;
+ else
+ return val & ~cc_mask;
case CC_VENDOR_INTEL:
return val | cc_mask;
default:
@@ -129,11 +149,6 @@ u64 cc_mkdec(u64 val)
}
EXPORT_SYMBOL_GPL(cc_mkdec);
-__init void cc_set_vendor(enum cc_vendor v)
-{
- vendor = v;
-}
-
__init void cc_set_mask(u64 mask)
{
cc_mask = mask;
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index f9eb1134f22d..b193c0a1d8db 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -13,6 +13,12 @@
/*
* Bitmasks of exposed registers (with VMM).
*/
+#define TDX_RDX BIT(2)
+#define TDX_RBX BIT(3)
+#define TDX_RSI BIT(6)
+#define TDX_RDI BIT(7)
+#define TDX_R8 BIT(8)
+#define TDX_R9 BIT(9)
#define TDX_R10 BIT(10)
#define TDX_R11 BIT(11)
#define TDX_R12 BIT(12)
@@ -27,9 +33,11 @@
* details can be found in TDX GHCI specification, section
* titled "TDCALL [TDG.VP.VMCALL] leaf".
*/
-#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \
- TDX_R12 | TDX_R13 | \
- TDX_R14 | TDX_R15 )
+#define TDVMCALL_EXPOSE_REGS_MASK \
+ ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
+ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 )
+
+.section .noinstr.text, "ax"
/*
* __tdx_module_call() - Used by TDX guests to request services from
@@ -77,12 +85,12 @@ SYM_FUNC_START(__tdx_module_call)
SYM_FUNC_END(__tdx_module_call)
/*
- * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf
- * of TDCALL instruction
+ * TDX_HYPERCALL - Make hypercalls to a TDX VMM using TDVMCALL leaf of TDCALL
+ * instruction
*
* Transforms values in function call argument struct tdx_hypercall_args @args
* into the TDCALL register ABI. After TDCALL operation, VMM output is saved
- * back in @args.
+ * back in @args, if \ret is 1.
*
*-------------------------------------------------------------------------
* TD VMCALL ABI:
@@ -97,26 +105,18 @@ SYM_FUNC_END(__tdx_module_call)
* specification. Non zero value indicates vendor
* specific ABI.
* R11 - VMCALL sub function number
- * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments.
+ * RBX, RDX, RDI, RSI - Used to pass VMCALL sub function specific arguments.
* R8-R9, R12-R15 - Same as above.
*
* Output Registers:
*
* RAX - TDCALL instruction status (Not related to hypercall
* output).
- * R10 - Hypercall output error code.
- * R11-R15 - Hypercall sub function specific output values.
- *
- *-------------------------------------------------------------------------
- *
- * __tdx_hypercall() function ABI:
+ * RBX, RDX, RDI, RSI - Hypercall sub function specific output values.
+ * R8-R15 - Same as above.
*
- * @args (RDI) - struct tdx_hypercall_args for input and output
- * @flags (RSI) - TDX_HCALL_* flags
- *
- * On successful completion, return the hypercall error code.
*/
-SYM_FUNC_START(__tdx_hypercall)
+.macro TDX_HYPERCALL ret:req
FRAME_BEGIN
/* Save callee-saved GPRs as mandated by the x86_64 ABI */
@@ -124,38 +124,37 @@ SYM_FUNC_START(__tdx_hypercall)
push %r14
push %r13
push %r12
+ push %rbx
+
+ /* Free RDI to be used as TDVMCALL arguments */
+ movq %rdi, %rax
+
+ /* Copy hypercall registers from arg struct: */
+ movq TDX_HYPERCALL_r8(%rax), %r8
+ movq TDX_HYPERCALL_r9(%rax), %r9
+ movq TDX_HYPERCALL_r10(%rax), %r10
+ movq TDX_HYPERCALL_r11(%rax), %r11
+ movq TDX_HYPERCALL_r12(%rax), %r12
+ movq TDX_HYPERCALL_r13(%rax), %r13
+ movq TDX_HYPERCALL_r14(%rax), %r14
+ movq TDX_HYPERCALL_r15(%rax), %r15
+ movq TDX_HYPERCALL_rdi(%rax), %rdi
+ movq TDX_HYPERCALL_rsi(%rax), %rsi
+ movq TDX_HYPERCALL_rbx(%rax), %rbx
+ movq TDX_HYPERCALL_rdx(%rax), %rdx
+
+ push %rax
/* Mangle function call ABI into TDCALL ABI: */
/* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
xor %eax, %eax
- /* Copy hypercall registers from arg struct: */
- movq TDX_HYPERCALL_r10(%rdi), %r10
- movq TDX_HYPERCALL_r11(%rdi), %r11
- movq TDX_HYPERCALL_r12(%rdi), %r12
- movq TDX_HYPERCALL_r13(%rdi), %r13
- movq TDX_HYPERCALL_r14(%rdi), %r14
- movq TDX_HYPERCALL_r15(%rdi), %r15
-
movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
- /*
- * For the idle loop STI needs to be called directly before the TDCALL
- * that enters idle (EXIT_REASON_HLT case). STI instruction enables
- * interrupts only one instruction later. If there is a window between
- * STI and the instruction that emulates the HALT state, there is a
- * chance for interrupts to happen in this window, which can delay the
- * HLT operation indefinitely. Since this is the not the desired
- * result, conditionally call STI before TDCALL.
- */
- testq $TDX_HCALL_ISSUE_STI, %rsi
- jz .Lskip_sti
- sti
-.Lskip_sti:
tdcall
/*
- * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that
+ * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that
* something has gone horribly wrong with the TDX module.
*
* The return status of the hypercall operation is in a separate
@@ -163,32 +162,43 @@ SYM_FUNC_START(__tdx_hypercall)
* and are handled by callers.
*/
testq %rax, %rax
- jne .Lpanic
+ jne .Lpanic\@
+
+ pop %rax
+
+ .if \ret
+ movq %r8, TDX_HYPERCALL_r8(%rax)
+ movq %r9, TDX_HYPERCALL_r9(%rax)
+ movq %r10, TDX_HYPERCALL_r10(%rax)
+ movq %r11, TDX_HYPERCALL_r11(%rax)
+ movq %r12, TDX_HYPERCALL_r12(%rax)
+ movq %r13, TDX_HYPERCALL_r13(%rax)
+ movq %r14, TDX_HYPERCALL_r14(%rax)
+ movq %r15, TDX_HYPERCALL_r15(%rax)
+ movq %rdi, TDX_HYPERCALL_rdi(%rax)
+ movq %rsi, TDX_HYPERCALL_rsi(%rax)
+ movq %rbx, TDX_HYPERCALL_rbx(%rax)
+ movq %rdx, TDX_HYPERCALL_rdx(%rax)
+ .endif
/* TDVMCALL leaf return code is in R10 */
movq %r10, %rax
- /* Copy hypercall result registers to arg struct if needed */
- testq $TDX_HCALL_HAS_OUTPUT, %rsi
- jz .Lout
-
- movq %r10, TDX_HYPERCALL_r10(%rdi)
- movq %r11, TDX_HYPERCALL_r11(%rdi)
- movq %r12, TDX_HYPERCALL_r12(%rdi)
- movq %r13, TDX_HYPERCALL_r13(%rdi)
- movq %r14, TDX_HYPERCALL_r14(%rdi)
- movq %r15, TDX_HYPERCALL_r15(%rdi)
-.Lout:
/*
* Zero out registers exposed to the VMM to avoid speculative execution
* with VMM-controlled values. This needs to include all registers
- * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
- * context will be restored.
+ * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which
+ * will be restored.
*/
+ xor %r8d, %r8d
+ xor %r9d, %r9d
xor %r10d, %r10d
xor %r11d, %r11d
+ xor %rdi, %rdi
+ xor %rdx, %rdx
/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ pop %rbx
pop %r12
pop %r13
pop %r14
@@ -197,9 +207,33 @@ SYM_FUNC_START(__tdx_hypercall)
FRAME_END
RET
-.Lpanic:
+.Lpanic\@:
call __tdx_hypercall_failed
/* __tdx_hypercall_failed never returns */
REACHABLE
- jmp .Lpanic
+ jmp .Lpanic\@
+.endm
+
+/*
+ *
+ * __tdx_hypercall() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall)
+ TDX_HYPERCALL ret=0
SYM_FUNC_END(__tdx_hypercall)
+
+/*
+ *
+ * __tdx_hypercall_ret() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input and output
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall_ret)
+ TDX_HYPERCALL ret=1
+SYM_FUNC_END(__tdx_hypercall_ret)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index b8998cf0508a..e146b599260f 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -5,6 +5,8 @@
#define pr_fmt(fmt) "tdx: " fmt
#include <linux/cpufeature.h>
+#include <linux/export.h>
+#include <linux/io.h>
#include <asm/coco.h>
#include <asm/tdx.h>
#include <asm/vmx.h>
@@ -15,10 +17,16 @@
/* TDX module Call Leaf IDs */
#define TDX_GET_INFO 1
#define TDX_GET_VEINFO 3
+#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
+#define TDX_WR 8
+
+/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
+#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
/* MMIO direction */
#define EPT_READ 0
@@ -34,8 +42,15 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+#define ATTR_DEBUG BIT(0)
#define ATTR_SEPT_VE_DISABLE BIT(28)
+/* TDX Module call error codes */
+#define TDCALL_RETURN_CODE(a) ((a) >> 32)
+#define TDCALL_INVALID_OPERAND 0xc0000100
+
+#define TDREPORT_SUBTYPE_0 0
+
/*
* Wrapper for standard use of __tdx_hypercall with no output aside from
* return code.
@@ -51,12 +66,13 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
.r15 = r15,
};
- return __tdx_hypercall(&args, 0);
+ return __tdx_hypercall(&args);
}
/* Called from __tdx_hypercall() for unrecoverable failure */
-void __tdx_hypercall_failed(void)
+noinstr void __tdx_hypercall_failed(void)
{
+ instrumentation_begin();
panic("TDVMCALL failed. TDX module bug?");
}
@@ -66,7 +82,7 @@ void __tdx_hypercall_failed(void)
* Reusing the KVM EXIT_REASON macros makes it easier to connect the host and
* guest sides of these calls.
*/
-static u64 hcall_func(u64 exit_reason)
+static __always_inline u64 hcall_func(u64 exit_reason)
{
return exit_reason;
}
@@ -83,7 +99,7 @@ long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
.r14 = p4,
};
- return __tdx_hypercall(&args, 0);
+ return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
@@ -100,6 +116,72 @@ static inline void tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9,
panic("TDCALL %lld failed (Buggy TDX module!)\n", fn);
}
+/**
+ * tdx_mcall_get_report0() - Wrapper to get TDREPORT0 (a.k.a. TDREPORT
+ * subtype 0) using TDG.MR.REPORT TDCALL.
+ * @reportdata: Address of the input buffer which contains user-defined
+ * REPORTDATA to be included into TDREPORT.
+ * @tdreport: Address of the output buffer to store TDREPORT.
+ *
+ * Refer to section titled "TDG.MR.REPORT leaf" in the TDX Module
+ * v1.0 specification for more information on TDG.MR.REPORT TDCALL.
+ * It is used in the TDX guest driver module to get the TDREPORT0.
+ *
+ * Return 0 on success, -EINVAL for invalid operands, or -EIO on
+ * other TDCALL failures.
+ */
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
+{
+ u64 ret;
+
+ ret = __tdx_module_call(TDX_GET_REPORT, virt_to_phys(tdreport),
+ virt_to_phys(reportdata), TDREPORT_SUBTYPE_0,
+ 0, NULL);
+ if (ret) {
+ if (TDCALL_RETURN_CODE(ret) == TDCALL_INVALID_OPERAND)
+ return -EINVAL;
+ return -EIO;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+
+static void __noreturn tdx_panic(const char *msg)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_REPORT_FATAL_ERROR,
+ .r12 = 0, /* Error code: 0 is Panic */
+ };
+ union {
+ /* Define register order according to the GHCI */
+ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
+
+ char str[64];
+ } message;
+
+ /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
+ strncpy(message.str, msg, 64);
+
+ args.r8 = message.r8;
+ args.r9 = message.r9;
+ args.r14 = message.r14;
+ args.r15 = message.r15;
+ args.rdi = message.rdi;
+ args.rsi = message.rsi;
+ args.rbx = message.rbx;
+ args.rdx = message.rdx;
+
+ /*
+ * This hypercall should never return and it is not safe
+ * to keep the guest running. Call it forever if it
+ * happens to return.
+ */
+ while (1)
+ __tdx_hypercall(&args);
+}
+
static void tdx_parse_tdinfo(u64 *cc_mask)
{
struct tdx_module_output out;
@@ -131,8 +213,15 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
td_attr = out.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE))
- panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n");
+ if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
+ const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
+
+ /* Relax SEPT_VE_DISABLE check for debug TD. */
+ if (td_attr & ATTR_DEBUG)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ }
}
/*
@@ -180,7 +269,7 @@ static int ve_instr_len(struct ve_info *ve)
}
}
-static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
+static u64 __cpuidle __halt(const bool irq_disabled)
{
struct tdx_hypercall_args args = {
.r10 = TDX_HYPERCALL_STANDARD,
@@ -200,20 +289,14 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
* can keep the vCPU in virtual HLT, even if an IRQ is
* pending, without hanging/breaking the guest.
*/
- return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
+ return __tdx_hypercall(&args);
}
static int handle_halt(struct ve_info *ve)
{
- /*
- * Since non safe halt is mainly used in CPU offlining
- * and the guest will always stay in the halt state, don't
- * call the STI instruction (set do_sti as false).
- */
const bool irq_disabled = irqs_disabled();
- const bool do_sti = false;
- if (__halt(irq_disabled, do_sti))
+ if (__halt(irq_disabled))
return -EIO;
return ve_instr_len(ve);
@@ -221,18 +304,12 @@ static int handle_halt(struct ve_info *ve)
void __cpuidle tdx_safe_halt(void)
{
- /*
- * For do_sti=true case, __tdx_hypercall() function enables
- * interrupts using the STI instruction before the TDCALL. So
- * set irq_disabled as false.
- */
const bool irq_disabled = false;
- const bool do_sti = true;
/*
* Use WARN_ONCE() to report the failure.
*/
- if (__halt(irq_disabled, do_sti))
+ if (__halt(irq_disabled))
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
@@ -249,7 +326,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return -EIO;
regs->ax = lower_32_bits(args.r11);
@@ -271,7 +348,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
*/
- if (__tdx_hypercall(&args, 0))
+ if (__tdx_hypercall(&args))
return -EIO;
return ve_instr_len(ve);
@@ -303,7 +380,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
* ABI can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return -EIO;
/*
@@ -330,7 +407,7 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r15 = *val,
};
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return false;
*val = args.r11;
return true;
@@ -346,8 +423,8 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
{
unsigned long *reg, val, vaddr;
char buffer[MAX_INSN_SIZE];
+ enum insn_mmio_type mmio;
struct insn insn = {};
- enum mmio_type mmio;
int size, extend_size;
u8 extend_val = 0;
@@ -362,10 +439,10 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EINVAL;
mmio = insn_decode_mmio(&insn, &size);
- if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
+ if (WARN_ON_ONCE(mmio == INSN_MMIO_DECODE_FAILED))
return -EINVAL;
- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
reg = insn_get_modrm_reg_ptr(&insn, regs);
if (!reg)
return -EINVAL;
@@ -386,23 +463,23 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
/* Handle writes first */
switch (mmio) {
- case MMIO_WRITE:
+ case INSN_MMIO_WRITE:
memcpy(&val, reg, size);
if (!mmio_write(size, ve->gpa, val))
return -EIO;
return insn.length;
- case MMIO_WRITE_IMM:
+ case INSN_MMIO_WRITE_IMM:
val = insn.immediate.value;
if (!mmio_write(size, ve->gpa, val))
return -EIO;
return insn.length;
- case MMIO_READ:
- case MMIO_READ_ZERO_EXTEND:
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ:
+ case INSN_MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
/* Reads are handled below */
break;
- case MMIO_MOVS:
- case MMIO_DECODE_FAILED:
+ case INSN_MMIO_MOVS:
+ case INSN_MMIO_DECODE_FAILED:
/*
* MMIO was accessed with an instruction that could not be
* decoded or handled properly. It was likely not using io.h
@@ -419,15 +496,15 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EIO;
switch (mmio) {
- case MMIO_READ:
+ case INSN_MMIO_READ:
/* Zero-extend for 32-bit operation */
extend_size = size == 4 ? sizeof(*reg) : 0;
break;
- case MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_ZERO_EXTEND:
/* Zero extend based on operand size */
extend_size = insn.opnd_bytes;
break;
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
/* Sign extend based on operand size */
extend_size = insn.opnd_bytes;
if (size == 1 && val & BIT(7))
@@ -464,7 +541,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
* in TDX Guest-Host-Communication Interface (GHCI) section titled
* "TDG.VP.VMCALL<Instruction.IO>".
*/
- success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);
+ success = !__tdx_hypercall_ret(&args);
/* Update part of the register affected by the emulated instruction */
regs->ax &= ~mask;
@@ -588,6 +665,11 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
}
}
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
/*
* Handle the kernel #VE.
*
@@ -606,6 +688,8 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
case EXIT_REASON_CPUID:
return handle_cpuid(regs, ve);
case EXIT_REASON_EPT_VIOLATION:
+ if (is_private_gpa(ve->gpa))
+ panic("Unexpected EPT-violation on private memory.");
return handle_mmio(regs, ve);
case EXIT_REASON_IO_INSTRUCTION:
return handle_io(regs, ve);
@@ -772,6 +856,9 @@ void __init tdx_early_init(void)
tdx_parse_tdinfo(&cc_mask);
cc_set_mask(cc_mask);
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+
/*
* All bits above GPA width are reserved and kernel treats shared bit
* as flag, not as part of physical address.
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 71c4c473d34b..9bbfd01cfa2f 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -304,6 +304,44 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64
Processes 16 blocks in parallel.
+config CRYPTO_ARIA_AESNI_AVX2_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
+ depends on X86 && 64BIT
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AES-NI (AES New Instructions)
+ - AVX2 (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 32 blocks in parallel.
+
+config CRYPTO_ARIA_GFNI_AVX512_X86_64
+ tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
+ depends on X86 && 64BIT && AS_AVX512 && AS_GFNI
+ select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
+ select CRYPTO_ALGAPI
+ select CRYPTO_ARIA
+ select CRYPTO_ARIA_AESNI_AVX_X86_64
+ select CRYPTO_ARIA_AESNI_AVX2_X86_64
+ help
+ Length-preserving cipher: ARIA cipher algorithms
+ (RFC 5794) with ECB and CTR modes
+
+ Architecture: x86_64 using:
+ - AVX512 (Advanced Vector Extensions)
+ - GFNI (Galois Field New Instructions)
+
+ Processes 64 blocks in parallel.
+
config CRYPTO_CHACHA20_X86_64
tristate "Ciphers: ChaCha20, XChaCha20, XChaCha12 (SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3b1d701a4f6c..9aa46093c91b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -103,7 +103,16 @@ sm4-aesni-avx2-x86_64-y := sm4-aesni-avx2-asm_64.o sm4_aesni_avx2_glue.o
obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) += aria-aesni-avx-x86_64.o
aria-aesni-avx-x86_64-y := aria-aesni-avx-asm_64.o aria_aesni_avx_glue.o
+obj-$(CONFIG_CRYPTO_ARIA_AESNI_AVX2_X86_64) += aria-aesni-avx2-x86_64.o
+aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
+
+obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
+aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
+
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
$(call if_changed,perlasm)
+
+# Disable GCOV in odd or sensitive code
+GCOV_PROFILE_curve25519-x86_64.o := n
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index b48ddebb4748..ad7f4c891625 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -7,6 +7,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define STATE0 %xmm0
@@ -200,8 +201,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_init)
movdqa KEY, STATE4
/* load the constants: */
- movdqa .Laegis128_const_0, STATE2
- movdqa .Laegis128_const_1, STATE1
+ movdqa .Laegis128_const_0(%rip), STATE2
+ movdqa .Laegis128_const_1(%rip), STATE1
pxor STATE2, STATE3
pxor STATE1, STATE4
@@ -402,7 +403,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_ad)
* void crypto_aegis128_aesni_enc(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
FRAME_BEGIN
cmp $0x10, LEN
@@ -499,7 +500,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc)
* void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_enc_tail)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
FRAME_BEGIN
/* load the state: */
@@ -556,7 +557,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
* void crypto_aegis128_aesni_dec(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
FRAME_BEGIN
cmp $0x10, LEN
@@ -653,7 +654,7 @@ SYM_FUNC_END(crypto_aegis128_aesni_dec)
* void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
* const void *src, void *dst);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
+SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
FRAME_BEGIN
/* load the state: */
@@ -681,7 +682,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_dec_tail)
punpcklbw T0, T0
punpcklbw T0, T0
punpcklbw T0, T0
- movdqa .Laegis128_counter, T1
+ movdqa .Laegis128_counter(%rip), T1
pcmpgtb T1, T0
pand T0, MSG
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 837c1e0aa021..3ac7487ecad2 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -288,53 +288,53 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff
# Encrypt/Decrypt first few blocks
and $(3<<4), %r12
- jz _initial_num_blocks_is_0_\@
+ jz .L_initial_num_blocks_is_0_\@
cmp $(2<<4), %r12
- jb _initial_num_blocks_is_1_\@
- je _initial_num_blocks_is_2_\@
-_initial_num_blocks_is_3_\@:
+ jb .L_initial_num_blocks_is_1_\@
+ je .L_initial_num_blocks_is_2_\@
+.L_initial_num_blocks_is_3_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
sub $48, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_2_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_2_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
sub $32, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_1_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_1_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
sub $16, %r13
- jmp _initial_blocks_\@
-_initial_num_blocks_is_0_\@:
+ jmp .L_initial_blocks_\@
+.L_initial_num_blocks_is_0_\@:
INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-_initial_blocks_\@:
+.L_initial_blocks_\@:
# Main loop - Encrypt/Decrypt remaining blocks
test %r13, %r13
- je _zero_cipher_left_\@
+ je .L_zero_cipher_left_\@
sub $64, %r13
- je _four_cipher_left_\@
-_crypt_by_4_\@:
+ je .L_four_cipher_left_\@
+.L_crypt_by_4_\@:
GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
%xmm7, %xmm8, enc
add $64, %r11
sub $64, %r13
- jne _crypt_by_4_\@
-_four_cipher_left_\@:
+ jne .L_crypt_by_4_\@
+.L_four_cipher_left_\@:
GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-_zero_cipher_left_\@:
+.L_zero_cipher_left_\@:
movdqu %xmm8, AadHash(%arg2)
movdqu %xmm0, CurCount(%arg2)
mov %arg5, %r13
and $15, %r13 # %r13 = arg5 (mod 16)
- je _multiple_of_16_bytes_\@
+ je .L_multiple_of_16_bytes_\@
mov %r13, PBlockLen(%arg2)
@@ -348,14 +348,14 @@ _zero_cipher_left_\@:
movdqu %xmm0, PBlockEncKey(%arg2)
cmp $16, %arg5
- jge _large_enough_update_\@
+ jge .L_large_enough_update_\@
lea (%arg4,%r11,1), %r10
mov %r13, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_large_enough_update_\@:
+.L_large_enough_update_\@:
sub $16, %r11
add %r13, %r11
@@ -374,7 +374,7 @@ _large_enough_update_\@:
# shift right 16-r13 bytes
pshufb %xmm2, %xmm1
-_data_read_\@:
+.L_data_read_\@:
lea ALL_F+16(%rip), %r12
sub %r13, %r12
@@ -409,19 +409,19 @@ _data_read_\@:
# Output %r13 bytes
movq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (%arg3 , %r11, 1)
add $8, %r11
psrldq $8, %xmm0
movq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
mov %al, (%arg3, %r11, 1)
add $1, %r11
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_multiple_of_16_bytes_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_multiple_of_16_bytes_\@:
.endm
# GCM_COMPLETE Finishes update of tag of last partial block
@@ -434,11 +434,11 @@ _multiple_of_16_bytes_\@:
mov PBlockLen(%arg2), %r12
test %r12, %r12
- je _partial_done\@
+ je .L_partial_done\@
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-_partial_done\@:
+.L_partial_done\@:
mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
movd %r12d, %xmm15 # len(A) in %xmm15
@@ -457,44 +457,44 @@ _partial_done\@:
movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
pxor %xmm8, %xmm0
-_return_T_\@:
+.L_return_T_\@:
mov \AUTHTAG, %r10 # %r10 = authTag
mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
cmp $16, %r11
- je _T_16_\@
+ je .L_T_16_\@
cmp $8, %r11
- jl _T_4_\@
-_T_8_\@:
+ jl .L_T_4_\@
+.L_T_8_\@:
movq %xmm0, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
psrldq $8, %xmm0
test %r11, %r11
- je _return_T_done_\@
-_T_4_\@:
+ je .L_return_T_done_\@
+.L_T_4_\@:
movd %xmm0, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
psrldq $4, %xmm0
test %r11, %r11
- je _return_T_done_\@
-_T_123_\@:
+ je .L_return_T_done_\@
+.L_T_123_\@:
movd %xmm0, %eax
cmp $2, %r11
- jl _T_1_\@
+ jl .L_T_1_\@
mov %ax, (%r10)
cmp $2, %r11
- je _return_T_done_\@
+ je .L_return_T_done_\@
add $2, %r10
sar $16, %eax
-_T_1_\@:
+.L_T_1_\@:
mov %al, (%r10)
- jmp _return_T_done_\@
-_T_16_\@:
+ jmp .L_return_T_done_\@
+.L_T_16_\@:
movdqu %xmm0, (%r10)
-_return_T_done_\@:
+.L_return_T_done_\@:
.endm
#ifdef __x86_64__
@@ -563,30 +563,30 @@ _return_T_done_\@:
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
cmp $8, \DLEN
- jl _read_lt8_\@
+ jl .L_read_lt8_\@
mov (\DPTR), %rax
movq %rax, \XMMDst
sub $8, \DLEN
- jz _done_read_partial_block_\@
+ jz .L_done_read_partial_block_\@
xor %eax, %eax
-_read_next_byte_\@:
+.L_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_\@
+ jnz .L_read_next_byte_\@
movq %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
- jmp _done_read_partial_block_\@
-_read_lt8_\@:
+ jmp .L_done_read_partial_block_\@
+.L_read_lt8_\@:
xor %eax, %eax
-_read_next_byte_lt8_\@:
+.L_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_lt8_\@
+ jnz .L_read_next_byte_lt8_\@
movq %rax, \XMMDst
-_done_read_partial_block_\@:
+.L_done_read_partial_block_\@:
.endm
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
@@ -600,8 +600,8 @@ _done_read_partial_block_\@:
pxor \TMP6, \TMP6
cmp $16, %r11
- jl _get_AAD_rest\@
-_get_AAD_blocks\@:
+ jl .L_get_AAD_rest\@
+.L_get_AAD_blocks\@:
movdqu (%r10), \TMP7
pshufb %xmm14, \TMP7 # byte-reflect the AAD data
pxor \TMP7, \TMP6
@@ -609,14 +609,14 @@ _get_AAD_blocks\@:
add $16, %r10
sub $16, %r11
cmp $16, %r11
- jge _get_AAD_blocks\@
+ jge .L_get_AAD_blocks\@
movdqu \TMP6, \TMP7
/* read the last <16B of AAD */
-_get_AAD_rest\@:
+.L_get_AAD_rest\@:
test %r11, %r11
- je _get_AAD_done\@
+ je .L_get_AAD_done\@
READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
pshufb %xmm14, \TMP7 # byte-reflect the AAD data
@@ -624,7 +624,7 @@ _get_AAD_rest\@:
GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
movdqu \TMP7, \TMP6
-_get_AAD_done\@:
+.L_get_AAD_done\@:
movdqu \TMP6, AadHash(%arg2)
.endm
@@ -637,21 +637,21 @@ _get_AAD_done\@:
AAD_HASH operation
mov PBlockLen(%arg2), %r13
test %r13, %r13
- je _partial_block_done_\@ # Leave Macro if no partial blocks
+ je .L_partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
- jl _fewer_than_16_bytes_\@
+ jl .L_fewer_than_16_bytes_\@
movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_fewer_than_16_bytes_\@:
+.L_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
mov PBlockLen(%arg2), %r13
-_data_read_\@: # Finished reading in data
+.L_data_read_\@: # Finished reading in data
movdqu PBlockEncKey(%arg2), %xmm9
movdqu HashKey(%arg2), %xmm13
@@ -674,9 +674,9 @@ _data_read_\@: # Finished reading in data
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_1_\@
+ jge .L_no_extra_mask_1_\@
sub %r10, %r12
-_no_extra_mask_1_\@:
+.L_no_extra_mask_1_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -689,17 +689,17 @@ _no_extra_mask_1_\@:
pxor %xmm3, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_1_\@
+ jl .L_partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
- jmp _dec_done_\@
-_partial_incomplete_1_\@:
+ jmp .L_dec_done_\@
+.L_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-_dec_done_\@:
+.L_dec_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
.else
pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
@@ -710,9 +710,9 @@ _dec_done_\@:
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_2_\@
+ jge .L_no_extra_mask_2_\@
sub %r10, %r12
-_no_extra_mask_2_\@:
+.L_no_extra_mask_2_\@:
movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -724,17 +724,17 @@ _no_extra_mask_2_\@:
pxor %xmm9, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_2_\@
+ jl .L_partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax, %eax
mov %rax, PBlockLen(%arg2)
- jmp _encode_done_\@
-_partial_incomplete_2_\@:
+ jmp .L_encode_done_\@
+.L_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
-_encode_done_\@:
+.L_encode_done_\@:
movdqu \AAD_HASH, AadHash(%arg2)
movdqa SHUF_MASK(%rip), %xmm10
@@ -744,32 +744,32 @@ _encode_done_\@:
.endif
# output encrypted Bytes
test %r10, %r10
- jl _partial_fill_\@
+ jl .L_partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
- jmp _count_set_\@
-_partial_fill_\@:
+ jmp .L_count_set_\@
+.L_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
-_count_set_\@:
+.L_count_set_\@:
movdqa %xmm9, %xmm0
movq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
movq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_partial_block_done_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
/*
@@ -813,14 +813,14 @@ _partial_block_done_\@:
shr $2,%eax # 128->4, 192->6, 256->8
add $5,%eax # 128->9, 192->11, 256->13
-aes_loop_initial_\@:
+.Laes_loop_initial_\@:
MOVADQ (%r10),\TMP1
.irpc index, \i_seq
aesenc \TMP1, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_initial_\@
+ jnz .Laes_loop_initial_\@
MOVADQ (%r10), \TMP1
.irpc index, \i_seq
@@ -861,7 +861,7 @@ aes_loop_initial_\@:
GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
.endif
cmp $64, %r13
- jl _initial_blocks_done\@
+ jl .L_initial_blocks_done\@
# no need for precomputed values
/*
*
@@ -908,18 +908,18 @@ aes_loop_initial_\@:
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_pre_done\@
+ jz .Laes_loop_pre_done\@
-aes_loop_pre_\@:
+.Laes_loop_pre_\@:
MOVADQ (%r10),\TMP2
.irpc index, 1234
aesenc \TMP2, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_pre_\@
+ jnz .Laes_loop_pre_\@
-aes_loop_pre_done\@:
+.Laes_loop_pre_done\@:
MOVADQ (%r10), \TMP2
aesenclast \TMP2, \XMM1
aesenclast \TMP2, \XMM2
@@ -963,7 +963,7 @@ aes_loop_pre_done\@:
pshufb %xmm14, \XMM3 # perform a 16 byte swap
pshufb %xmm14, \XMM4 # perform a 16 byte swap
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
@@ -1095,18 +1095,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_enc_done\@
+ jz .Laes_loop_par_enc_done\@
-aes_loop_par_enc\@:
+.Laes_loop_par_enc\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_par_enc\@
+ jnz .Laes_loop_par_enc\@
-aes_loop_par_enc_done\@:
+.Laes_loop_par_enc_done\@:
MOVADQ (%r10), \TMP3
aesenclast \TMP3, \XMM1 # Round 10
aesenclast \TMP3, \XMM2
@@ -1303,18 +1303,18 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
mov keysize,%eax
shr $2,%eax # 128->4, 192->6, 256->8
sub $4,%eax # 128->0, 192->2, 256->4
- jz aes_loop_par_dec_done\@
+ jz .Laes_loop_par_dec_done\@
-aes_loop_par_dec\@:
+.Laes_loop_par_dec\@:
MOVADQ (%r10),\TMP3
.irpc index, 1234
aesenc \TMP3, %xmm\index
.endr
add $16,%r10
sub $1,%eax
- jnz aes_loop_par_dec\@
+ jnz .Laes_loop_par_dec\@
-aes_loop_par_dec_done\@:
+.Laes_loop_par_dec_done\@:
MOVADQ (%r10), \TMP3
aesenclast \TMP3, \XMM1 # last round
aesenclast \TMP3, \XMM2
@@ -2717,7 +2717,7 @@ SYM_FUNC_END(aesni_cts_cbc_dec)
* BSWAP_MASK == endian swapping mask
*/
SYM_FUNC_START_LOCAL(_aesni_inc_init)
- movaps .Lbswap_mask, BSWAP_MASK
+ movaps .Lbswap_mask(%rip), BSWAP_MASK
movaps IV, CTR
pshufb BSWAP_MASK, CTR
mov $1, TCTR_LOW
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 0852ab573fd3..46cddd78857b 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -154,30 +154,6 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
-.section .rodata
-.align 16
-.type aad_shift_arr, @object
-.size aad_shift_arr, 272
-aad_shift_arr:
- .octa 0xffffffffffffffffffffffffffffffff
- .octa 0xffffffffffffffffffffffffffffff0C
- .octa 0xffffffffffffffffffffffffffff0D0C
- .octa 0xffffffffffffffffffffffffff0E0D0C
- .octa 0xffffffffffffffffffffffff0F0E0D0C
- .octa 0xffffffffffffffffffffff0C0B0A0908
- .octa 0xffffffffffffffffffff0D0C0B0A0908
- .octa 0xffffffffffffffffff0E0D0C0B0A0908
- .octa 0xffffffffffffffff0F0E0D0C0B0A0908
- .octa 0xffffffffffffff0C0B0A090807060504
- .octa 0xffffffffffff0D0C0B0A090807060504
- .octa 0xffffffffff0E0D0C0B0A090807060504
- .octa 0xffffffff0F0E0D0C0B0A090807060504
- .octa 0xffffff0C0B0A09080706050403020100
- .octa 0xffff0D0C0B0A09080706050403020100
- .octa 0xff0E0D0C0B0A09080706050403020100
- .octa 0x0F0E0D0C0B0A09080706050403020100
-
-
.text
@@ -302,68 +278,68 @@ VARIABLE_OFFSET = 16*8
mov %r13, %r12
shr $4, %r12
and $7, %r12
- jz _initial_num_blocks_is_0\@
+ jz .L_initial_num_blocks_is_0\@
cmp $7, %r12
- je _initial_num_blocks_is_7\@
+ je .L_initial_num_blocks_is_7\@
cmp $6, %r12
- je _initial_num_blocks_is_6\@
+ je .L_initial_num_blocks_is_6\@
cmp $5, %r12
- je _initial_num_blocks_is_5\@
+ je .L_initial_num_blocks_is_5\@
cmp $4, %r12
- je _initial_num_blocks_is_4\@
+ je .L_initial_num_blocks_is_4\@
cmp $3, %r12
- je _initial_num_blocks_is_3\@
+ je .L_initial_num_blocks_is_3\@
cmp $2, %r12
- je _initial_num_blocks_is_2\@
+ je .L_initial_num_blocks_is_2\@
- jmp _initial_num_blocks_is_1\@
+ jmp .L_initial_num_blocks_is_1\@
-_initial_num_blocks_is_7\@:
+.L_initial_num_blocks_is_7\@:
\INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_6\@:
+.L_initial_num_blocks_is_6\@:
\INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_5\@:
+.L_initial_num_blocks_is_5\@:
\INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_4\@:
+.L_initial_num_blocks_is_4\@:
\INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_3\@:
+.L_initial_num_blocks_is_3\@:
\INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_2\@:
+.L_initial_num_blocks_is_2\@:
\INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_1\@:
+.L_initial_num_blocks_is_1\@:
\INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
+ jmp .L_initial_blocks_encrypted\@
-_initial_num_blocks_is_0\@:
+.L_initial_num_blocks_is_0\@:
\INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-_initial_blocks_encrypted\@:
+.L_initial_blocks_encrypted\@:
test %r13, %r13
- je _zero_cipher_left\@
+ je .L_zero_cipher_left\@
sub $128, %r13
- je _eight_cipher_left\@
+ je .L_eight_cipher_left\@
@@ -373,9 +349,9 @@ _initial_blocks_encrypted\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-_encrypt_by_8_new\@:
+.L_encrypt_by_8_new\@:
cmp $(255-8), %r15d
- jg _encrypt_by_8\@
+ jg .L_encrypt_by_8\@
@@ -383,30 +359,30 @@ _encrypt_by_8_new\@:
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
add $128, %r11
sub $128, %r13
- jne _encrypt_by_8_new\@
+ jne .L_encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
+ jmp .L_eight_cipher_left\@
-_encrypt_by_8\@:
+.L_encrypt_by_8\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $128, %r11
sub $128, %r13
- jne _encrypt_by_8_new\@
+ jne .L_encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-_eight_cipher_left\@:
+.L_eight_cipher_left\@:
\GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-_zero_cipher_left\@:
+.L_zero_cipher_left\@:
vmovdqu %xmm14, AadHash(arg2)
vmovdqu %xmm9, CurCount(arg2)
@@ -414,7 +390,7 @@ _zero_cipher_left\@:
mov arg5, %r13
and $15, %r13 # r13 = (arg5 mod 16)
- je _multiple_of_16_bytes\@
+ je .L_multiple_of_16_bytes\@
# handle the last <16 Byte block separately
@@ -428,7 +404,7 @@ _zero_cipher_left\@:
vmovdqu %xmm9, PBlockEncKey(arg2)
cmp $16, arg5
- jge _large_enough_update\@
+ jge .L_large_enough_update\@
lea (arg4,%r11,1), %r10
mov %r13, %r12
@@ -440,9 +416,9 @@ _zero_cipher_left\@:
# able to shift 16-r13 bytes (r13 is the
# number of bytes in plaintext mod 16)
- jmp _final_ghash_mul\@
+ jmp .L_final_ghash_mul\@
-_large_enough_update\@:
+.L_large_enough_update\@:
sub $16, %r11
add %r13, %r11
@@ -461,7 +437,7 @@ _large_enough_update\@:
# shift right 16-r13 bytes
vpshufb %xmm2, %xmm1, %xmm1
-_final_ghash_mul\@:
+.L_final_ghash_mul\@:
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm2
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
@@ -490,7 +466,7 @@ _final_ghash_mul\@:
# output r13 Bytes
vmovq %xmm9, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left\@
+ jle .L_less_than_8_bytes_left\@
mov %rax, (arg3 , %r11)
add $8, %r11
@@ -498,15 +474,15 @@ _final_ghash_mul\@:
vmovq %xmm9, %rax
sub $8, %r13
-_less_than_8_bytes_left\@:
+.L_less_than_8_bytes_left\@:
movb %al, (arg3 , %r11)
add $1, %r11
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left\@
+ jne .L_less_than_8_bytes_left\@
#############################
-_multiple_of_16_bytes\@:
+.L_multiple_of_16_bytes\@:
.endm
@@ -519,12 +495,12 @@ _multiple_of_16_bytes\@:
mov PBlockLen(arg2), %r12
test %r12, %r12
- je _partial_done\@
+ je .L_partial_done\@
#GHASH computation for the last <16 Byte block
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-_partial_done\@:
+.L_partial_done\@:
mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
vmovd %r12d, %xmm15 # len(A) in xmm15
@@ -547,49 +523,49 @@ _partial_done\@:
-_return_T\@:
+.L_return_T\@:
mov \AUTH_TAG, %r10 # r10 = authTag
mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
cmp $16, %r11
- je _T_16\@
+ je .L_T_16\@
cmp $8, %r11
- jl _T_4\@
+ jl .L_T_4\@
-_T_8\@:
+.L_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
test %r11, %r11
- je _return_T_done\@
-_T_4\@:
+ je .L_return_T_done\@
+.L_T_4\@:
vmovd %xmm9, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
test %r11, %r11
- je _return_T_done\@
-_T_123\@:
+ je .L_return_T_done\@
+.L_T_123\@:
vmovd %xmm9, %eax
cmp $2, %r11
- jl _T_1\@
+ jl .L_T_1\@
mov %ax, (%r10)
cmp $2, %r11
- je _return_T_done\@
+ je .L_return_T_done\@
add $2, %r10
sar $16, %eax
-_T_1\@:
+.L_T_1\@:
mov %al, (%r10)
- jmp _return_T_done\@
+ jmp .L_return_T_done\@
-_T_16\@:
+.L_T_16\@:
vmovdqu %xmm9, (%r10)
-_return_T_done\@:
+.L_return_T_done\@:
.endm
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
@@ -603,8 +579,8 @@ _return_T_done\@:
vpxor \T8, \T8, \T8
vpxor \T7, \T7, \T7
cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
+ jl .L_get_AAD_rest8\@
+.L_get_AAD_blocks\@:
vmovdqu (%r10), \T7
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T7, \T8, \T8
@@ -613,29 +589,29 @@ _get_AAD_blocks\@:
sub $16, %r12
sub $16, %r11
cmp $16, %r11
- jge _get_AAD_blocks\@
+ jge .L_get_AAD_blocks\@
vmovdqu \T8, \T7
test %r11, %r11
- je _get_AAD_done\@
+ je .L_get_AAD_done\@
vpxor \T7, \T7, \T7
/* read the last <16B of AAD. since we have at least 4B of
data right after the AAD (the ICV, and maybe some CT), we can
read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
+.L_get_AAD_rest8\@:
cmp $4, %r11
- jle _get_AAD_rest4\@
+ jle .L_get_AAD_rest4\@
movq (%r10), \T1
add $8, %r10
sub $8, %r11
vpslldq $8, \T1, \T1
vpsrldq $8, \T7, \T7
vpxor \T1, \T7, \T7
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
+ jmp .L_get_AAD_rest8\@
+.L_get_AAD_rest4\@:
test %r11, %r11
- jle _get_AAD_rest0\@
+ jle .L_get_AAD_rest0\@
mov (%r10), %eax
movq %rax, \T1
add $4, %r10
@@ -643,20 +619,22 @@ _get_AAD_rest4\@:
vpslldq $12, \T1, \T1
vpsrldq $4, \T7, \T7
vpxor \T1, \T7, \T7
-_get_AAD_rest0\@:
+.L_get_AAD_rest0\@:
/* finalize: shift out the extra bytes we read, and align
left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- vmovdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, \T7, \T7
-_get_AAD_rest_final\@:
+ vpshufb and a pair of shuffle masks */
+ leaq ALL_F(%rip), %r11
+ subq %r12, %r11
+ vmovdqu 16(%r11), \T1
+ andq $~3, %r11
+ vpshufb (%r11), \T7, \T7
+ vpand \T1, \T7, \T7
+.L_get_AAD_rest_final\@:
vpshufb SHUF_MASK(%rip), \T7, \T7
vpxor \T8, \T7, \T7
\GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
-_get_AAD_done\@:
+.L_get_AAD_done\@:
vmovdqu \T7, AadHash(arg2)
.endm
@@ -707,28 +685,28 @@ _get_AAD_done\@:
vpxor \XMMDst, \XMMDst, \XMMDst
cmp $8, \DLEN
- jl _read_lt8_\@
+ jl .L_read_lt8_\@
mov (\DPTR), %rax
vpinsrq $0, %rax, \XMMDst, \XMMDst
sub $8, \DLEN
- jz _done_read_partial_block_\@
+ jz .L_done_read_partial_block_\@
xor %eax, %eax
-_read_next_byte_\@:
+.L_read_next_byte_\@:
shl $8, %rax
mov 7(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_\@
+ jnz .L_read_next_byte_\@
vpinsrq $1, %rax, \XMMDst, \XMMDst
- jmp _done_read_partial_block_\@
-_read_lt8_\@:
+ jmp .L_done_read_partial_block_\@
+.L_read_lt8_\@:
xor %eax, %eax
-_read_next_byte_lt8_\@:
+.L_read_next_byte_lt8_\@:
shl $8, %rax
mov -1(\DPTR, \DLEN, 1), %al
dec \DLEN
- jnz _read_next_byte_lt8_\@
+ jnz .L_read_next_byte_lt8_\@
vpinsrq $0, %rax, \XMMDst, \XMMDst
-_done_read_partial_block_\@:
+.L_done_read_partial_block_\@:
.endm
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
@@ -740,21 +718,21 @@ _done_read_partial_block_\@:
AAD_HASH ENC_DEC
mov PBlockLen(arg2), %r13
test %r13, %r13
- je _partial_block_done_\@ # Leave Macro if no partial blocks
+ je .L_partial_block_done_\@ # Leave Macro if no partial blocks
# Read in input data without over reading
cmp $16, \PLAIN_CYPH_LEN
- jl _fewer_than_16_bytes_\@
+ jl .L_fewer_than_16_bytes_\@
vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
- jmp _data_read_\@
+ jmp .L_data_read_\@
-_fewer_than_16_bytes_\@:
+.L_fewer_than_16_bytes_\@:
lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
mov \PLAIN_CYPH_LEN, %r12
READ_PARTIAL_BLOCK %r10 %r12 %xmm1
mov PBlockLen(arg2), %r13
-_data_read_\@: # Finished reading in data
+.L_data_read_\@: # Finished reading in data
vmovdqu PBlockEncKey(arg2), %xmm9
vmovdqu HashKey(arg2), %xmm13
@@ -777,9 +755,9 @@ _data_read_\@: # Finished reading in data
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_1_\@
+ jge .L_no_extra_mask_1_\@
sub %r10, %r12
-_no_extra_mask_1_\@:
+.L_no_extra_mask_1_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -792,17 +770,17 @@ _no_extra_mask_1_\@:
vpxor %xmm3, \AAD_HASH, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_1_\@
+ jl .L_partial_incomplete_1_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
- jmp _dec_done_\@
-_partial_incomplete_1_\@:
+ jmp .L_dec_done_\@
+.L_partial_incomplete_1_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-_dec_done_\@:
+.L_dec_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
@@ -813,9 +791,9 @@ _dec_done_\@:
sub $16, %r10
# Determine if if partial block is not being filled and
# shift mask accordingly
- jge _no_extra_mask_2_\@
+ jge .L_no_extra_mask_2_\@
sub %r10, %r12
-_no_extra_mask_2_\@:
+.L_no_extra_mask_2_\@:
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
# get the appropriate mask to mask out bottom r13 bytes of xmm9
@@ -827,17 +805,17 @@ _no_extra_mask_2_\@:
vpxor %xmm9, \AAD_HASH, \AAD_HASH
test %r10, %r10
- jl _partial_incomplete_2_\@
+ jl .L_partial_incomplete_2_\@
# GHASH computation for the last <16 Byte block
\GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
xor %eax,%eax
mov %rax, PBlockLen(arg2)
- jmp _encode_done_\@
-_partial_incomplete_2_\@:
+ jmp .L_encode_done_\@
+.L_partial_incomplete_2_\@:
add \PLAIN_CYPH_LEN, PBlockLen(arg2)
-_encode_done_\@:
+.L_encode_done_\@:
vmovdqu \AAD_HASH, AadHash(arg2)
vmovdqa SHUF_MASK(%rip), %xmm10
@@ -847,32 +825,32 @@ _encode_done_\@:
.endif
# output encrypted Bytes
test %r10, %r10
- jl _partial_fill_\@
+ jl .L_partial_fill_\@
mov %r13, %r12
mov $16, %r13
# Set r13 to be the number of bytes to write out
sub %r12, %r13
- jmp _count_set_\@
-_partial_fill_\@:
+ jmp .L_count_set_\@
+.L_partial_fill_\@:
mov \PLAIN_CYPH_LEN, %r13
-_count_set_\@:
+.L_count_set_\@:
vmovdqa %xmm9, %xmm0
vmovq %xmm0, %rax
cmp $8, %r13
- jle _less_than_8_bytes_left_\@
+ jle .L_less_than_8_bytes_left_\@
mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $8, \DATA_OFFSET
psrldq $8, %xmm0
vmovq %xmm0, %rax
sub $8, %r13
-_less_than_8_bytes_left_\@:
+.L_less_than_8_bytes_left_\@:
movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
add $1, \DATA_OFFSET
shr $8, %rax
sub $1, %r13
- jne _less_than_8_bytes_left_\@
-_partial_block_done_\@:
+ jne .L_less_than_8_bytes_left_\@
+.L_partial_block_done_\@:
.endm # PARTIAL_BLOCK
###############################################################################
@@ -1073,7 +1051,7 @@ _partial_block_done_\@:
vmovdqa \XMM8, \T3
cmp $128, %r13
- jl _initial_blocks_done\@ # no need for precomputed constants
+ jl .L_initial_blocks_done\@ # no need for precomputed constants
###############################################################################
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
@@ -1215,7 +1193,7 @@ _partial_block_done_\@:
###############################################################################
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
@@ -2023,7 +2001,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
vmovdqa \XMM8, \T3
cmp $128, %r13
- jl _initial_blocks_done\@ # no need for precomputed constants
+ jl .L_initial_blocks_done\@ # no need for precomputed constants
###############################################################################
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
@@ -2167,7 +2145,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
###############################################################################
-_initial_blocks_done\@:
+.L_initial_blocks_done\@:
.endm
diff --git a/arch/x86/crypto/aria-aesni-avx-asm_64.S b/arch/x86/crypto/aria-aesni-avx-asm_64.S
index c75fd7d015ed..9556dacd9841 100644
--- a/arch/x86/crypto/aria-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/aria-aesni-avx-asm_64.S
@@ -7,13 +7,10 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
+#include <asm/asm-offsets.h>
#include <asm/frame.h>
-/* struct aria_ctx: */
-#define enc_key 0
-#define dec_key 272
-#define rounds 544
-
/* register macros */
#define CTX %rdi
@@ -83,7 +80,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -135,7 +132,7 @@
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -270,34 +267,44 @@
#define aria_ark_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
- t0, rk, idx, round) \
+ t0, t1, t2, rk, \
+ idx, round) \
/* AddRoundKey */ \
- vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
- vpxor t0, x0, x0; \
- vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
- vpxor t0, x1, x1; \
- vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
- vpxor t0, x2, x2; \
- vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
- vpxor t0, x3, x3; \
- vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
- vpxor t0, x4, x4; \
- vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
- vpxor t0, x5, x5; \
- vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
- vpxor t0, x6, x6; \
- vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
- vpxor t0, x7, x7;
-
+ vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x0, x0; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x1, x1; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x2, x2; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x3, x3; \
+ vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
+ vpsrld $24, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x4, x4; \
+ vpsrld $16, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x5, x5; \
+ vpsrld $8, t0, t2; \
+ vpshufb t1, t2, t2; \
+ vpxor t2, x6, x6; \
+ vpshufb t1, t0, t2; \
+ vpxor t2, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpbroadcastq .Ltf_s2_bitmatrix, t0; \
- vpbroadcastq .Ltf_inv_bitmatrix, t1; \
- vpbroadcastq .Ltf_id_bitmatrix, t2; \
- vpbroadcastq .Ltf_aff_bitmatrix, t3; \
- vpbroadcastq .Ltf_x2_bitmatrix, t4; \
+ vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
+ vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
+ vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
+ vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
+ vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
@@ -311,18 +318,19 @@
vgf2p8affineinvqb $0, t2, x3, x3; \
vgf2p8affineinvqb $0, t2, x7, x7
+#endif /* CONFIG_AS_GFNI */
+
#define aria_sbox_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
t0, t1, t2, t3, \
t4, t5, t6, t7) \
- vpxor t7, t7, t7; \
- vmovdqa .Linv_shift_row, t0; \
- vmovdqa .Lshift_row, t1; \
- vpbroadcastd .L0f0f0f0f, t6; \
- vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
- vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
- vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
- vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
+ vmovdqa .Linv_shift_row(%rip), t0; \
+ vmovdqa .Lshift_row(%rip), t1; \
+ vbroadcastss .L0f0f0f0f(%rip), t6; \
+ vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
\
vaesenclast t7, x0, x0; \
vaesenclast t7, x4, x4; \
@@ -413,8 +421,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -429,7 +438,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -467,8 +476,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -483,7 +493,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
y0, y1, y2, y3, y4, y5, y6, y7); \
@@ -521,14 +531,15 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
+ y0, y7, y2, rk, 8, last_round); \
\
aria_store_state_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -538,25 +549,27 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
y0, y1, y2, y3, y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
+ y0, y7, y2, rk, 0, last_round); \
\
aria_load_state_8way(y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, 8);
+#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -573,7 +586,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -613,8 +626,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -631,7 +645,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -671,8 +685,9 @@
y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, rk, round, last_round) \
+ vpxor y7, y7, y7; \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, round); \
+ y0, y7, y2, rk, 8, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -680,7 +695,7 @@
y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 8, last_round); \
+ y0, y7, y2, rk, 8, last_round); \
\
aria_store_state_8way(x0, x1, x2, x3, \
x4, x5, x6, x7, \
@@ -690,7 +705,7 @@
x4, x5, x6, x7, \
mem_tmp, 0); \
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, round); \
+ y0, y7, y2, rk, 0, round); \
\
aria_sbox_8way_gfni(x2, x3, x0, x1, \
x6, x7, x4, x5, \
@@ -698,12 +713,14 @@
y4, y5, y6, y7); \
\
aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
- y0, rk, 0, last_round); \
+ y0, y7, y2, rk, 0, last_round); \
\
aria_load_state_8way(y0, y1, y2, y3, \
y4, y5, y6, y7, \
mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16
@@ -755,8 +772,7 @@
.Ltf_hi__x2__and__fwd_aff:
.octa 0x3F893781E95FE1576CDA64D2BA0CB204
-.section .rodata.cst8, "aM", @progbits, 8
-.align 8
+#ifdef CONFIG_AS_GFNI
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
@@ -768,6 +784,14 @@
BV8(0, 1, 1, 1, 1, 1, 0, 0),
BV8(0, 0, 1, 1, 1, 1, 1, 0),
BV8(0, 0, 0, 1, 1, 1, 1, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
@@ -780,6 +804,14 @@
BV8(0, 0, 1, 0, 1, 0, 0, 1),
BV8(1, 0, 0, 1, 0, 1, 0, 0),
BV8(0, 1, 0, 0, 1, 0, 1, 0))
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
@@ -792,6 +824,14 @@
BV8(1, 1, 0, 0, 1, 1, 1, 0),
BV8(0, 1, 1, 0, 0, 0, 1, 1),
BV8(1, 1, 1, 1, 0, 1, 1, 0))
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
@@ -804,6 +844,14 @@
BV8(0, 1, 1, 0, 1, 0, 1, 1),
BV8(1, 0, 1, 1, 1, 1, 0, 1),
BV8(1, 0, 0, 1, 0, 0, 1, 1))
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
/* Identity matrix: */
.Ltf_id_bitmatrix:
@@ -815,6 +863,15 @@
BV8(0, 0, 0, 0, 0, 1, 0, 0),
BV8(0, 0, 0, 0, 0, 0, 1, 0),
BV8(0, 0, 0, 0, 0, 0, 0, 1))
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+#endif /* CONFIG_AS_GFNI */
/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
@@ -873,7 +930,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 10);
- cmpl $12, rounds(CTX);
+ cmpl $12, ARIA_CTX_rounds(CTX);
jne .Laria_192;
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -886,7 +943,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 12);
- cmpl $14, rounds(CTX);
+ cmpl $14, ARIA_CTX_rounds(CTX);
jne .Laria_256;
aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -913,7 +970,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)
-SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -922,7 +979,7 @@ SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
FRAME_BEGIN
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -938,7 +995,7 @@ SYM_FUNC_START(aria_aesni_avx_encrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -947,7 +1004,7 @@ SYM_FUNC_START(aria_aesni_avx_decrypt_16way)
FRAME_BEGIN
- leaq dec_key(CTX), %r9;
+ leaq ARIA_CTX_dec_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1039,7 +1096,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
-SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -1055,7 +1112,7 @@ SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
leaq (%rdx), %r11;
leaq (%rcx), %rsi;
leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
call __aria_aesni_avx_crypt_16way;
@@ -1083,6 +1140,7 @@ SYM_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
+#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
/* input:
* %r9: rk
@@ -1156,7 +1214,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
%xmm0, %xmm1, %xmm2, %xmm3,
%xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 10);
- cmpl $12, rounds(CTX);
+ cmpl $12, ARIA_CTX_rounds(CTX);
jne .Laria_gfni_192;
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1173,7 +1231,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
%xmm0, %xmm1, %xmm2, %xmm3,
%xmm4, %xmm5, %xmm6, %xmm7,
%rax, %r9, 12);
- cmpl $14, rounds(CTX);
+ cmpl $14, ARIA_CTX_rounds(CTX);
jne .Laria_gfni_256;
aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
%xmm4, %xmm5, %xmm6, %xmm7,
@@ -1208,7 +1266,7 @@ SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -1217,7 +1275,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
FRAME_BEGIN
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1233,7 +1291,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
/* input:
* %rdi: ctx, CTX
* %rsi: dst
@@ -1242,7 +1300,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
FRAME_BEGIN
- leaq dec_key(CTX), %r9;
+ leaq ARIA_CTX_dec_key(CTX), %r9;
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -1258,7 +1316,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)
-SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
+SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -1274,7 +1332,7 @@ SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
leaq (%rdx), %r11;
leaq (%rcx), %rsi;
leaq (%rcx), %rdx;
- leaq enc_key(CTX), %r9;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
call __aria_aesni_avx_gfni_crypt_16way;
@@ -1301,3 +1359,4 @@ SYM_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
FRAME_END
RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
+#endif /* CONFIG_AS_GFNI */
diff --git a/arch/x86/crypto/aria-aesni-avx2-asm_64.S b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..c60fa2980630
--- /dev/null
+++ b/arch/x86/crypto/aria-aesni-avx2-asm_64.S
@@ -0,0 +1,1441 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 32-way parallel algorithm (AVX2)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpand x, mask4bit, tmp0; \
+ vpandn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxor tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu d2, st0; \
+ vmovdqu d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu a0, st0; \
+ vmovdqu a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu d3, st1; \
+ vmovdqu st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu st0, d2; \
+ vmovdqu st1, d3; \
+ \
+ vmovdqu b0, st0; \
+ vmovdqu b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu st0, b0; \
+ vmovdqu st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu (0 * 32)(rio), x0; \
+ vmovdqu (1 * 32)(rio), x1; \
+ vmovdqu (2 * 32)(rio), x2; \
+ vmovdqu (3 * 32)(rio), x3; \
+ vmovdqu (4 * 32)(rio), x4; \
+ vmovdqu (5 * 32)(rio), x5; \
+ vmovdqu (6 * 32)(rio), x6; \
+ vmovdqu (7 * 32)(rio), x7; \
+ vmovdqu (8 * 32)(rio), y0; \
+ vmovdqu (9 * 32)(rio), y1; \
+ vmovdqu (10 * 32)(rio), y2; \
+ vmovdqu (11 * 32)(rio), y3; \
+ vmovdqu (12 * 32)(rio), y4; \
+ vmovdqu (13 * 32)(rio), y5; \
+ vmovdqu (14 * 32)(rio), y6; \
+ vmovdqu (15 * 32)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu x0, 0 * 32(mem_ab); \
+ vmovdqu x1, 1 * 32(mem_ab); \
+ vmovdqu x2, 2 * 32(mem_ab); \
+ vmovdqu x3, 3 * 32(mem_ab); \
+ vmovdqu x4, 4 * 32(mem_ab); \
+ vmovdqu x5, 5 * 32(mem_ab); \
+ vmovdqu x6, 6 * 32(mem_ab); \
+ vmovdqu x7, 7 * 32(mem_ab); \
+ vmovdqu y0, 0 * 32(mem_cd); \
+ vmovdqu y1, 1 * 32(mem_cd); \
+ vmovdqu y2, 2 * 32(mem_cd); \
+ vmovdqu y3, 3 * 32(mem_cd); \
+ vmovdqu y4, 4 * 32(mem_cd); \
+ vmovdqu y5, 5 * 32(mem_cd); \
+ vmovdqu y6, 6 * 32(mem_cd); \
+ vmovdqu y7, 7 * 32(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu x0, 0 * 32(mem); \
+ vmovdqu x1, 1 * 32(mem); \
+ vmovdqu x2, 2 * 32(mem); \
+ vmovdqu x3, 3 * 32(mem); \
+ vmovdqu x4, 4 * 32(mem); \
+ vmovdqu x5, 5 * 32(mem); \
+ vmovdqu x6, 6 * 32(mem); \
+ vmovdqu x7, 7 * 32(mem); \
+ vmovdqu y0, 8 * 32(mem); \
+ vmovdqu y1, 9 * 32(mem); \
+ vmovdqu y2, 10 * 32(mem); \
+ vmovdqu y3, 11 * 32(mem); \
+ vmovdqu y4, 12 * 32(mem); \
+ vmovdqu y5, 13 * 32(mem); \
+ vmovdqu y6, 14 * 32(mem); \
+ vmovdqu y7, 15 * 32(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
+ vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
+ vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
+ vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
+ vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
+ vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
+ vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
+ vmovdqu x7, ((idx + 7) * 32)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
+ vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
+ vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
+ vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
+ vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
+ vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
+ vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
+ vmovdqu ((idx + 7) * 32)(mem_tmp), x7;
+
+#define aria_ark_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, rk, idx, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
+ vpxor t0, x0, x0; \
+ vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
+ vpxor t0, x1, x1; \
+ vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
+ vpxor t0, x2, x2; \
+ vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
+ vpxor t0, x3, x3; \
+ vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
+ vpxor t0, x4, x4; \
+ vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
+ vpxor t0, x5, x5; \
+ vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
+ vpxor t0, x6, x6; \
+ vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
+ vpxor t0, x7, x7;
+
+#ifdef CONFIG_AS_GFNI
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7
+
+#endif /* CONFIG_AS_GFNI */
+#define aria_sbox_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpxor t7, t7, t7; \
+ vpxor t6, t6, t6; \
+ vbroadcasti128 .Linv_shift_row(%rip), t0; \
+ vbroadcasti128 .Lshift_row(%rip), t1; \
+ vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
+ vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
+ vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
+ vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
+ \
+ vextracti128 $1, x0, t6##_x; \
+ vaesenclast t7##_x, x0##_x, x0##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x0, x0; \
+ \
+ vextracti128 $1, x4, t6##_x; \
+ vaesenclast t7##_x, x4##_x, x4##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x4, x4; \
+ \
+ vextracti128 $1, x1, t6##_x; \
+ vaesenclast t7##_x, x1##_x, x1##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x1, x1; \
+ \
+ vextracti128 $1, x5, t6##_x; \
+ vaesenclast t7##_x, x5##_x, x5##_x; \
+ vaesenclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x5, x5; \
+ \
+ vextracti128 $1, x2, t6##_x; \
+ vaesdeclast t7##_x, x2##_x, x2##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x2, x2; \
+ \
+ vextracti128 $1, x6, t6##_x; \
+ vaesdeclast t7##_x, x6##_x, x6##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x6, x6; \
+ \
+ vpbroadcastd .L0f0f0f0f(%rip), t6; \
+ \
+ /* AES inverse shift rows */ \
+ vpshufb t0, x0, x0; \
+ vpshufb t0, x4, x4; \
+ vpshufb t0, x1, x1; \
+ vpshufb t0, x5, x5; \
+ vpshufb t1, x3, x3; \
+ vpshufb t1, x7, x7; \
+ vpshufb t1, x2, x2; \
+ vpshufb t1, x6, x6; \
+ \
+ /* affine transformation for S2 */ \
+ filter_8bit(x1, t2, t3, t6, t0); \
+ /* affine transformation for S2 */ \
+ filter_8bit(x5, t2, t3, t6, t0); \
+ \
+ /* affine transformation for X2 */ \
+ filter_8bit(x3, t4, t5, t6, t0); \
+ /* affine transformation for X2 */ \
+ filter_8bit(x7, t4, t5, t6, t0); \
+ \
+ vpxor t6, t6, t6; \
+ vextracti128 $1, x3, t6##_x; \
+ vaesdeclast t7##_x, x3##_x, x3##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x3, x3; \
+ \
+ vextracti128 $1, x7, t6##_x; \
+ vaesdeclast t7##_x, x7##_x, x7##_x; \
+ vaesdeclast t7##_x, t6##_x, t6##_x; \
+ vinserti128 $1, t6##_x, x7, x7; \
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxor x0, x3, t0; \
+ vpxor x1, x0, t1; \
+ vpxor x2, x1, t2; \
+ vpxor x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxor t2, x0, x0; \
+ vpxor x1, t3, t3; \
+ vpxor t0, x2, x2; \
+ vpxor t1, x3, x1; \
+ vmovdqu t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxor y4, y0, y0; \
+ vpxor y5, y1, y1; \
+ vpxor y6, y2, y2; \
+ vpxor y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxor x4, x0, x0; \
+ vpxor x5, x1, x1; \
+ vpxor x6, x2, x2; \
+ vpxor x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxor x4, y4, y4; \
+ vpxor x5, y5, y5; \
+ vpxor x6, y6, y6; \
+ vpxor x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxor x0, y0, y0; \
+ vpxor x1, y1, y1; \
+ vpxor x2, y2, y2; \
+ vpxor x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxor y0, x4, x4; \
+ vpxor y1, x5, x5; \
+ vpxor y2, x6, x6; \
+ vpxor y3, x7, x7;
+
+#define aria_fe(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
+ y0, y1, y2, y3, y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+#ifdef CONFIG_AS_GFNI
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
+ aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4); \
+ aria_store_state_8way(x3, x2, x1, x0, \
+ x6, x7, x4, x5, \
+ mem_tmp, 0);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 8, last_round); \
+ \
+ aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 8); \
+ \
+ aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, 0); \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, round); \
+ \
+ aria_sbox_8way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ \
+ aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, rk, 0, last_round); \
+ \
+ aria_load_state_8way(y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_tmp, 8);
+#endif /* CONFIG_AS_GFNI */
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+ .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+ .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+.Lshift_row:
+ .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
+ .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+/* AES inverse affine and S2 combined:
+ * 1 1 0 0 0 0 0 1 x0 0
+ * 0 1 0 0 1 0 0 0 x1 0
+ * 1 1 0 0 1 1 1 1 x2 0
+ * 0 1 1 0 1 0 0 1 x3 1
+ * 0 1 0 0 1 1 0 0 * x4 + 0
+ * 0 1 0 1 1 0 0 0 x5 0
+ * 0 0 0 0 0 1 0 1 x6 0
+ * 1 1 1 0 0 1 1 1 x7 1
+ */
+.Ltf_lo__inv_aff__and__s2:
+ .octa 0x92172DA81A9FA520B2370D883ABF8500
+.Ltf_hi__inv_aff__and__s2:
+ .octa 0x2B15FFC1AF917B45E6D8320C625CB688
+
+/* X2 and AES forward affine combined:
+ * 1 0 1 1 0 0 0 1 x0 0
+ * 0 1 1 1 1 0 1 1 x1 0
+ * 0 0 0 1 1 0 1 0 x2 1
+ * 0 1 0 0 0 1 0 0 x3 0
+ * 0 0 1 1 1 0 1 1 * x4 + 0
+ * 0 1 0 0 1 0 0 0 x5 0
+ * 1 1 0 1 0 0 1 1 x6 0
+ * 0 1 0 0 1 0 1 0 x7 0
+ */
+.Ltf_lo__x2__and__fwd_aff:
+ .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
+.Ltf_hi__x2__and__fwd_aff:
+ .octa 0x3F893781E95FE1576CDA64D2BA0CB204
+
+#ifdef CONFIG_AS_GFNI
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#endif /* CONFIG_AS_GFNI */
+
+/* 4-bit mask */
+.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
+.align 4
+.L0f0f0f0f:
+ .long 0x0f0f0f0f
+
+.text
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_192;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_end;
+.Laria_192:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_256;
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_end;
+.Laria_256:
+ aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)
+
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+ movq 8(%r8), %r11;
+ bswapq %r11;
+
+ vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
+ vpcmpeqd %ymm0, %ymm0, %ymm0;
+ vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%r8), %xmm7;
+ vpshufb %xmm6, %xmm7, %xmm7;
+ vmovdqa %xmm7, %xmm3;
+ inc_le128(%xmm7, %xmm0, %xmm4);
+ vinserti128 $1, %xmm7, %ymm3, %ymm3;
+ vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 32), %r11;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
+ vpshufb %ymm6, %ymm3, %ymm8;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
+ vpshufb %ymm6, %ymm3, %ymm9;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
+ vpshufb %ymm6, %ymm3, %ymm10;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
+ vpshufb %ymm6, %ymm3, %ymm11;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
+ vpshufb %ymm6, %ymm3, %ymm12;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
+ vpshufb %ymm6, %ymm3, %ymm13;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
+ vpshufb %ymm6, %ymm3, %ymm14;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
+ vpshufb %ymm6, %ymm3, %ymm15;
+ vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
+ vpshufb %xmm6, %xmm3, %xmm3;
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+ jmp .Lctr_carry_done;
+
+ .Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
+ vmovdqu %ymm8, (0 * 32)(%rcx);
+ vmovdqu %ymm9, (1 * 32)(%rcx);
+ vmovdqu %ymm10, (2 * 32)(%rcx);
+ vmovdqu %ymm11, (3 * 32)(%rcx);
+ vmovdqu %ymm12, (4 * 32)(%rcx);
+ vmovdqu %ymm13, (5 * 32)(%rcx);
+ vmovdqu %ymm14, (6 * 32)(%rcx);
+ vmovdqu %ymm15, (7 * 32)(%rcx);
+
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
+ inc_le128(%ymm3, %ymm0, %ymm4);
+ vextracti128 $1, %ymm3, %xmm3;
+ vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
+ vmovdqu %xmm3, (%r8);
+ vmovdqu (0 * 32)(%rcx), %ymm0;
+ vmovdqu (1 * 32)(%rcx), %ymm1;
+ vmovdqu (2 * 32)(%rcx), %ymm2;
+ vmovdqu (3 * 32)(%rcx), %ymm3;
+ vmovdqu (4 * 32)(%rcx), %ymm4;
+ vmovdqu (5 * 32)(%rcx), %ymm5;
+ vmovdqu (6 * 32)(%rcx), %ymm6;
+ vmovdqu (7 * 32)(%rcx), %ymm7;
+
+ .Lctr_carry_done:
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way;
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)
+
+#ifdef CONFIG_AS_GFNI
+SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %ymm0..%ymm15: 16 byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 32(%rax), %r8;
+
+ inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r8);
+ aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 0);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 1);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 2);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 3);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 4);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 5);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 6);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 7);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 8);
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 9);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 11);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 13);
+ aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
+ %ymm12, %ymm13, %ymm14, %ymm15,
+ %ymm0, %ymm1, %ymm2, %ymm3,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %rax, %r9, 14);
+ aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
+ %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11,
+ %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
+ %ymm9, %ymm13, %ymm0, %ymm5,
+ %ymm10, %ymm14, %ymm3, %ymm6,
+ %ymm11, %ymm15, %ymm2, %ymm7,
+ (%rax), (%r8));
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rdx);
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
+
+SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_aesni_avx2_ctr_gen_keystream_32way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_aesni_avx2_gfni_crypt_32way;
+
+ vpxor (0 * 32)(%r11), %ymm1, %ymm1;
+ vpxor (1 * 32)(%r11), %ymm0, %ymm0;
+ vpxor (2 * 32)(%r11), %ymm3, %ymm3;
+ vpxor (3 * 32)(%r11), %ymm2, %ymm2;
+ vpxor (4 * 32)(%r11), %ymm4, %ymm4;
+ vpxor (5 * 32)(%r11), %ymm5, %ymm5;
+ vpxor (6 * 32)(%r11), %ymm6, %ymm6;
+ vpxor (7 * 32)(%r11), %ymm7, %ymm7;
+ vpxor (8 * 32)(%r11), %ymm8, %ymm8;
+ vpxor (9 * 32)(%r11), %ymm9, %ymm9;
+ vpxor (10 * 32)(%r11), %ymm10, %ymm10;
+ vpxor (11 * 32)(%r11), %ymm11, %ymm11;
+ vpxor (12 * 32)(%r11), %ymm12, %ymm12;
+ vpxor (13 * 32)(%r11), %ymm13, %ymm13;
+ vpxor (14 * 32)(%r11), %ymm14, %ymm14;
+ vpxor (15 * 32)(%r11), %ymm15, %ymm15;
+ write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
+ %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+ %ymm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
+#endif /* CONFIG_AS_GFNI */
diff --git a/arch/x86/crypto/aria-avx.h b/arch/x86/crypto/aria-avx.h
index 01e9a01dc157..6e1b2d8a31ed 100644
--- a/arch/x86/crypto/aria-avx.h
+++ b/arch/x86/crypto/aria-avx.h
@@ -5,12 +5,58 @@
#include <linux/types.h>
#define ARIA_AESNI_PARALLEL_BLOCKS 16
-#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * 16)
+#define ARIA_AESNI_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_PARALLEL_BLOCKS)
+
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+#define ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_AESNI_AVX2_PARALLEL_BLOCKS)
+
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCKS 64
+#define ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE (ARIA_BLOCK_SIZE * ARIA_GFNI_AVX512_PARALLEL_BLOCKS)
+
+asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
struct aria_avx_ops {
void (*aria_encrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
void (*aria_decrypt_16way)(const void *ctx, u8 *dst, const u8 *src);
void (*aria_ctr_crypt_16way)(const void *ctx, u8 *dst, const u8 *src,
u8 *keystream, u8 *iv);
+ void (*aria_encrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_32way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_32way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+ void (*aria_encrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_decrypt_64way)(const void *ctx, u8 *dst, const u8 *src);
+ void (*aria_ctr_crypt_64way)(const void *ctx, u8 *dst, const u8 *src,
+ u8 *keystream, u8 *iv);
+
+
};
#endif
diff --git a/arch/x86/crypto/aria-gfni-avx512-asm_64.S b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
new file mode 100644
index 000000000000..860887e5d02e
--- /dev/null
+++ b/arch/x86/crypto/aria-gfni-avx512-asm_64.S
@@ -0,0 +1,971 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ARIA Cipher 64-way parallel algorithm (AVX512)
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+#include <asm/asm-offsets.h>
+#include <linux/cfi_types.h>
+
+/* register macros */
+#define CTX %rdi
+
+
+#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+#define add_le128(out, in, lo_counter, hi_counter1) \
+ vpaddq lo_counter, in, out; \
+ vpcmpuq $1, lo_counter, out, %k1; \
+ kaddb %k1, %k1, %k1; \
+ vpaddq hi_counter1, out, out{%k1};
+
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+ vpandq x, mask4bit, tmp0; \
+ vpandqn x, mask4bit, x; \
+ vpsrld $4, x, x; \
+ \
+ vpshufb tmp0, lo_t, tmp0; \
+ vpshufb x, hi_t, x; \
+ vpxorq tmp0, x, x;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(a0, b0, c0, d0, d2, d3); \
+ transpose_4x4(a1, b1, c1, d1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(a2, b2, c2, d2, b0, b1); \
+ transpose_4x4(a3, b3, c3, d3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+#define debyteslice_16x16b(a0, b0, c0, d0, \
+ a1, b1, c1, d1, \
+ a2, b2, c2, d2, \
+ a3, b3, c3, d3, \
+ st0, st1) \
+ vmovdqu64 d2, st0; \
+ vmovdqu64 d3, st1; \
+ transpose_4x4(a0, a1, a2, a3, d2, d3); \
+ transpose_4x4(b0, b1, b2, b3, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 a0, st0; \
+ vmovdqu64 a1, st1; \
+ transpose_4x4(c0, c1, c2, c3, a0, a1); \
+ transpose_4x4(d0, d1, d2, d3, a0, a1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b(%rip), a0; \
+ vmovdqu64 st1, a1; \
+ vpshufb a0, a2, a2; \
+ vpshufb a0, a3, a3; \
+ vpshufb a0, b0, b0; \
+ vpshufb a0, b1, b1; \
+ vpshufb a0, b2, b2; \
+ vpshufb a0, b3, b3; \
+ vpshufb a0, a1, a1; \
+ vpshufb a0, c0, c0; \
+ vpshufb a0, c1, c1; \
+ vpshufb a0, c2, c2; \
+ vpshufb a0, c3, c3; \
+ vpshufb a0, d0, d0; \
+ vpshufb a0, d1, d1; \
+ vpshufb a0, d2, d2; \
+ vpshufb a0, d3, d3; \
+ vmovdqu64 d3, st1; \
+ vmovdqu64 st0, d3; \
+ vpshufb a0, d3, a0; \
+ vmovdqu64 d2, st0; \
+ \
+ transpose_4x4(c0, d0, a0, b0, d2, d3); \
+ transpose_4x4(c1, d1, a1, b1, d2, d3); \
+ vmovdqu64 st0, d2; \
+ vmovdqu64 st1, d3; \
+ \
+ vmovdqu64 b0, st0; \
+ vmovdqu64 b1, st1; \
+ transpose_4x4(c2, d2, a2, b2, b0, b1); \
+ transpose_4x4(c3, d3, a3, b3, b0, b1); \
+ vmovdqu64 st0, b0; \
+ vmovdqu64 st1, b1; \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack16_pre(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ rio) \
+ vmovdqu64 (0 * 64)(rio), x0; \
+ vmovdqu64 (1 * 64)(rio), x1; \
+ vmovdqu64 (2 * 64)(rio), x2; \
+ vmovdqu64 (3 * 64)(rio), x3; \
+ vmovdqu64 (4 * 64)(rio), x4; \
+ vmovdqu64 (5 * 64)(rio), x5; \
+ vmovdqu64 (6 * 64)(rio), x6; \
+ vmovdqu64 (7 * 64)(rio), x7; \
+ vmovdqu64 (8 * 64)(rio), y0; \
+ vmovdqu64 (9 * 64)(rio), y1; \
+ vmovdqu64 (10 * 64)(rio), y2; \
+ vmovdqu64 (11 * 64)(rio), y3; \
+ vmovdqu64 (12 * 64)(rio), y4; \
+ vmovdqu64 (13 * 64)(rio), y5; \
+ vmovdqu64 (14 * 64)(rio), y6; \
+ vmovdqu64 (15 * 64)(rio), y7;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack16_post(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem_ab, mem_cd) \
+ byteslice_16x16b(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ (mem_ab), (mem_cd)); \
+ \
+ vmovdqu64 x0, 0 * 64(mem_ab); \
+ vmovdqu64 x1, 1 * 64(mem_ab); \
+ vmovdqu64 x2, 2 * 64(mem_ab); \
+ vmovdqu64 x3, 3 * 64(mem_ab); \
+ vmovdqu64 x4, 4 * 64(mem_ab); \
+ vmovdqu64 x5, 5 * 64(mem_ab); \
+ vmovdqu64 x6, 6 * 64(mem_ab); \
+ vmovdqu64 x7, 7 * 64(mem_ab); \
+ vmovdqu64 y0, 0 * 64(mem_cd); \
+ vmovdqu64 y1, 1 * 64(mem_cd); \
+ vmovdqu64 y2, 2 * 64(mem_cd); \
+ vmovdqu64 y3, 3 * 64(mem_cd); \
+ vmovdqu64 y4, 4 * 64(mem_cd); \
+ vmovdqu64 y5, 5 * 64(mem_cd); \
+ vmovdqu64 y6, 6 * 64(mem_cd); \
+ vmovdqu64 y7, 7 * 64(mem_cd);
+
+#define write_output(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ mem) \
+ vmovdqu64 x0, 0 * 64(mem); \
+ vmovdqu64 x1, 1 * 64(mem); \
+ vmovdqu64 x2, 2 * 64(mem); \
+ vmovdqu64 x3, 3 * 64(mem); \
+ vmovdqu64 x4, 4 * 64(mem); \
+ vmovdqu64 x5, 5 * 64(mem); \
+ vmovdqu64 x6, 6 * 64(mem); \
+ vmovdqu64 x7, 7 * 64(mem); \
+ vmovdqu64 y0, 8 * 64(mem); \
+ vmovdqu64 y1, 9 * 64(mem); \
+ vmovdqu64 y2, 10 * 64(mem); \
+ vmovdqu64 y3, 11 * 64(mem); \
+ vmovdqu64 y4, 12 * 64(mem); \
+ vmovdqu64 y5, 13 * 64(mem); \
+ vmovdqu64 y6, 14 * 64(mem); \
+ vmovdqu64 y7, 15 * 64(mem); \
+
+#define aria_store_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 x0, ((idx + 0) * 64)(mem_tmp); \
+ vmovdqu64 x1, ((idx + 1) * 64)(mem_tmp); \
+ vmovdqu64 x2, ((idx + 2) * 64)(mem_tmp); \
+ vmovdqu64 x3, ((idx + 3) * 64)(mem_tmp); \
+ vmovdqu64 x4, ((idx + 4) * 64)(mem_tmp); \
+ vmovdqu64 x5, ((idx + 5) * 64)(mem_tmp); \
+ vmovdqu64 x6, ((idx + 6) * 64)(mem_tmp); \
+ vmovdqu64 x7, ((idx + 7) * 64)(mem_tmp);
+
+#define aria_load_state_8way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ mem_tmp, idx) \
+ vmovdqu64 ((idx + 0) * 64)(mem_tmp), x0; \
+ vmovdqu64 ((idx + 1) * 64)(mem_tmp), x1; \
+ vmovdqu64 ((idx + 2) * 64)(mem_tmp), x2; \
+ vmovdqu64 ((idx + 3) * 64)(mem_tmp), x3; \
+ vmovdqu64 ((idx + 4) * 64)(mem_tmp), x4; \
+ vmovdqu64 ((idx + 5) * 64)(mem_tmp), x5; \
+ vmovdqu64 ((idx + 6) * 64)(mem_tmp), x6; \
+ vmovdqu64 ((idx + 7) * 64)(mem_tmp), x7;
+
+#define aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, rk, round) \
+ /* AddRoundKey */ \
+ vpbroadcastb ((round * 16) + 3)(rk), t0; \
+ vpxorq t0, x0, x0; \
+ vpbroadcastb ((round * 16) + 2)(rk), t0; \
+ vpxorq t0, x1, x1; \
+ vpbroadcastb ((round * 16) + 1)(rk), t0; \
+ vpxorq t0, x2, x2; \
+ vpbroadcastb ((round * 16) + 0)(rk), t0; \
+ vpxorq t0, x3, x3; \
+ vpbroadcastb ((round * 16) + 7)(rk), t0; \
+ vpxorq t0, x4, x4; \
+ vpbroadcastb ((round * 16) + 6)(rk), t0; \
+ vpxorq t0, x5, x5; \
+ vpbroadcastb ((round * 16) + 5)(rk), t0; \
+ vpxorq t0, x6, x6; \
+ vpbroadcastb ((round * 16) + 4)(rk), t0; \
+ vpxorq t0, x7, x7; \
+ vpbroadcastb ((round * 16) + 11)(rk), t0; \
+ vpxorq t0, y0, y0; \
+ vpbroadcastb ((round * 16) + 10)(rk), t0; \
+ vpxorq t0, y1, y1; \
+ vpbroadcastb ((round * 16) + 9)(rk), t0; \
+ vpxorq t0, y2, y2; \
+ vpbroadcastb ((round * 16) + 8)(rk), t0; \
+ vpxorq t0, y3, y3; \
+ vpbroadcastb ((round * 16) + 15)(rk), t0; \
+ vpxorq t0, y4, y4; \
+ vpbroadcastb ((round * 16) + 14)(rk), t0; \
+ vpxorq t0, y5, y5; \
+ vpbroadcastb ((round * 16) + 13)(rk), t0; \
+ vpxorq t0, y6, y6; \
+ vpbroadcastb ((round * 16) + 12)(rk), t0; \
+ vpxorq t0, y7, y7;
+
+#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7;
+
+#define aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ t0, t1, t2, t3, \
+ t4, t5, t6, t7) \
+ vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
+ vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
+ vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
+ vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
+ vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
+ vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
+ vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
+ vgf2p8affineinvqb $0, t2, x2, x2; \
+ vgf2p8affineinvqb $0, t2, x6, x6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
+ vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
+ vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
+ vgf2p8affineinvqb $0, t2, x3, x3; \
+ vgf2p8affineinvqb $0, t2, x7, x7; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y1, y1; \
+ vgf2p8affineinvqb $(tf_s2_const), t0, y5, y5; \
+ vgf2p8affineqb $(tf_inv_const), t1, y2, y2; \
+ vgf2p8affineqb $(tf_inv_const), t1, y6, y6; \
+ vgf2p8affineinvqb $0, t2, y2, y2; \
+ vgf2p8affineinvqb $0, t2, y6, y6; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y0, y0; \
+ vgf2p8affineinvqb $(tf_aff_const), t3, y4, y4; \
+ vgf2p8affineqb $(tf_x2_const), t4, y3, y3; \
+ vgf2p8affineqb $(tf_x2_const), t4, y7, y7; \
+ vgf2p8affineinvqb $0, t2, y3, y3; \
+ vgf2p8affineinvqb $0, t2, y7, y7;
+
+
+#define aria_diff_m(x0, x1, x2, x3, \
+ t0, t1, t2, t3) \
+ /* T = rotr32(X, 8); */ \
+ /* X ^= T */ \
+ vpxorq x0, x3, t0; \
+ vpxorq x1, x0, t1; \
+ vpxorq x2, x1, t2; \
+ vpxorq x3, x2, t3; \
+ /* X = T ^ rotr(X, 16); */ \
+ vpxorq t2, x0, x0; \
+ vpxorq x1, t3, t3; \
+ vpxorq t0, x2, x2; \
+ vpxorq t1, x3, x1; \
+ vmovdqu64 t3, x3;
+
+#define aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7) \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7; \
+ \
+ /* t2 ^= t3; */ \
+ vpxorq y4, y0, y0; \
+ vpxorq y5, y1, y1; \
+ vpxorq y6, y2, y2; \
+ vpxorq y7, y3, y3; \
+ \
+ /* t0 ^= t1; */ \
+ vpxorq x4, x0, x0; \
+ vpxorq x5, x1, x1; \
+ vpxorq x6, x2, x2; \
+ vpxorq x7, x3, x3; \
+ \
+ /* t3 ^= t1; */ \
+ vpxorq x4, y4, y4; \
+ vpxorq x5, y5, y5; \
+ vpxorq x6, y6, y6; \
+ vpxorq x7, y7, y7; \
+ \
+ /* t2 ^= t0; */ \
+ vpxorq x0, y0, y0; \
+ vpxorq x1, y1, y1; \
+ vpxorq x2, y2, y2; \
+ vpxorq x3, y3, y3; \
+ \
+ /* t1 ^= t2; */ \
+ vpxorq y0, x4, x4; \
+ vpxorq y1, x5, x5; \
+ vpxorq y2, x6, x6; \
+ vpxorq y3, x7, x7;
+
+#define aria_fe_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T3 = ABCD -> BADC \
+ * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
+ * T0 = ABCD -> CDAB \
+ * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
+ * T1 = ABCD -> DCBA \
+ * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
+ */ \
+ aria_diff_word(x2, x3, x0, x1, \
+ x7, x6, x5, x4, \
+ y0, y1, y2, y3, \
+ y5, y4, y7, y6); \
+
+
+#define aria_fo_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round) \
+ aria_ark_16way(x0, x1, x2, x3, x4, x5, x6, x7, \
+ y0, y1, y2, y3, y4, y5, y6, y7, \
+ z0, rk, round); \
+ \
+ aria_sbox_16way_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ \
+ aria_diff_m(x0, x1, x2, x3, z0, z1, z2, z3); \
+ aria_diff_m(x4, x5, x6, x7, z0, z1, z2, z3); \
+ aria_diff_m(y0, y1, y2, y3, z0, z1, z2, z3); \
+ aria_diff_m(y4, y5, y6, y7, z0, z1, z2, z3); \
+ aria_diff_word(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7); \
+ /* aria_diff_byte() \
+ * T1 = ABCD -> BADC \
+ * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
+ * T2 = ABCD -> CDAB \
+ * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
+ * T3 = ABCD -> DCBA \
+ * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
+ */ \
+ aria_diff_word(x0, x1, x2, x3, \
+ x5, x4, x7, x6, \
+ y2, y3, y0, y1, \
+ y7, y6, y5, y4);
+
+#define aria_ff_gfni(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7, \
+ mem_tmp, rk, round, last_round) \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, round); \
+ aria_sbox_16way_gfni(x2, x3, x0, x1, \
+ x6, x7, x4, x5, \
+ y2, y3, y0, y1, \
+ y6, y7, y4, y5, \
+ z0, z1, z2, z3, \
+ z4, z5, z6, z7); \
+ aria_ark_16way(x0, x1, x2, x3, \
+ x4, x5, x6, x7, \
+ y0, y1, y2, y3, \
+ y4, y5, y6, y7, \
+ z0, rk, last_round);
+
+
+.section .rodata.cst64, "aM", @progbits, 64
+.align 64
+.Lcounter0123_lo:
+ .quad 0, 0
+ .quad 1, 0
+ .quad 2, 0
+ .quad 3, 0
+
+.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.section .rodata.cst16, "aM", @progbits, 16
+.align 16
+
+.Lcounter4444_lo:
+ .quad 4, 0
+.Lcounter8888_lo:
+ .quad 8, 0
+.Lcounter16161616_lo:
+ .quad 16, 0
+.Lcounter1111_hi:
+ .quad 0, 1
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+ .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+
+.section .rodata.cst8, "aM", @progbits, 8
+.align 8
+/* AES affine: */
+#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
+.Ltf_aff_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
+ BV8(1, 1, 0, 0, 0, 1, 1, 1),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 0, 0, 1),
+ BV8(1, 1, 1, 1, 1, 0, 0, 0),
+ BV8(0, 1, 1, 1, 1, 1, 0, 0),
+ BV8(0, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 1, 1, 1))
+
+/* AES inverse affine: */
+#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
+.Ltf_inv_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 1, 0, 1, 0, 0, 1, 0),
+ BV8(0, 0, 1, 0, 1, 0, 0, 1),
+ BV8(1, 0, 0, 1, 0, 1, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 1, 0))
+
+/* S2: */
+#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
+.Ltf_s2_bitmatrix:
+ .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
+ BV8(0, 0, 1, 1, 1, 1, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(1, 1, 0, 0, 0, 0, 1, 1),
+ BV8(0, 1, 0, 0, 0, 0, 1, 1),
+ BV8(1, 1, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 1, 0, 1, 1, 0))
+
+/* X2: */
+#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
+.Ltf_x2_bitmatrix:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 1, 1, 0),
+ BV8(0, 0, 0, 0, 1, 0, 1, 0),
+ BV8(1, 1, 1, 0, 0, 0, 1, 1),
+ BV8(1, 1, 1, 0, 1, 1, 0, 0),
+ BV8(0, 1, 1, 0, 1, 0, 1, 1),
+ BV8(1, 0, 1, 1, 1, 1, 0, 1),
+ BV8(1, 0, 0, 1, 0, 0, 1, 1))
+
+/* Identity matrix: */
+.Ltf_id_bitmatrix:
+ .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
+ BV8(0, 1, 0, 0, 0, 0, 0, 0),
+ BV8(0, 0, 1, 0, 0, 0, 0, 0),
+ BV8(0, 0, 0, 1, 0, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 0, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 1, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+.text
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_crypt_64way)
+ /* input:
+ * %r9: rk
+ * %rsi: dst
+ * %rdx: src
+ * %zmm0..%zmm15: byte-sliced blocks
+ */
+
+ FRAME_BEGIN
+
+ movq %rsi, %rax;
+ leaq 8 * 64(%rax), %r8;
+
+ inpack16_post(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax, %r8);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 0);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 1);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 2);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 3);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 4);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 5);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 6);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 7);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 8);
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 9);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 10);
+ cmpl $12, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_192;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11, 12);
+ jmp .Laria_gfni_end;
+.Laria_gfni_192:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 11);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 12);
+ cmpl $14, ARIA_CTX_rounds(CTX);
+ jne .Laria_gfni_256;
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13, 14);
+ jmp .Laria_gfni_end;
+.Laria_gfni_256:
+ aria_fe_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 13);
+ aria_fo_gfni(%zmm0, %zmm1, %zmm2, %zmm3,
+ %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 14);
+ aria_ff_gfni(%zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10,
+ %zmm12, %zmm13, %zmm14, %zmm15,
+ %zmm24, %zmm25, %zmm26, %zmm27,
+ %zmm28, %zmm29, %zmm30, %zmm31,
+ %rax, %r9, 15, 16);
+.Laria_gfni_end:
+ debyteslice_16x16b(%zmm9, %zmm12, %zmm3, %zmm6,
+ %zmm8, %zmm13, %zmm2, %zmm7,
+ %zmm11, %zmm14, %zmm1, %zmm4,
+ %zmm10, %zmm15, %zmm0, %zmm5,
+ (%rax), (%r8));
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_crypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_encrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_encrypt_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_decrypt_64way)
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ FRAME_BEGIN
+
+ leaq ARIA_CTX_dec_key(CTX), %r9;
+
+ inpack16_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx);
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rax);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_decrypt_64way)
+
+SYM_FUNC_START_LOCAL(__aria_gfni_avx512_ctr_gen_keystream_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+
+ FRAME_BEGIN
+
+ vbroadcasti64x2 .Lbswap128_mask (%rip), %zmm19;
+ vmovdqa64 .Lcounter0123_lo (%rip), %zmm21;
+ vbroadcasti64x2 .Lcounter4444_lo (%rip), %zmm22;
+ vbroadcasti64x2 .Lcounter8888_lo (%rip), %zmm23;
+ vbroadcasti64x2 .Lcounter16161616_lo (%rip), %zmm24;
+ vbroadcasti64x2 .Lcounter1111_hi (%rip), %zmm25;
+
+ /* load IV and byteswap */
+ movq 8(%r8), %r11;
+ movq (%r8), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ vbroadcasti64x2 (%r8), %zmm20;
+ vpshufb %zmm19, %zmm20, %zmm20;
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 64), %r11;
+ ja .Lload_ctr_carry;
+
+ /* construct IVs */
+ vpaddq %zmm21, %zmm20, %zmm0; /* +0:+1:+2:+3 */
+ vpaddq %zmm22, %zmm0, %zmm1; /* +4:+5:+6:+7 */
+ vpaddq %zmm23, %zmm0, %zmm2; /* +8:+9:+10:+11 */
+ vpaddq %zmm23, %zmm1, %zmm3; /* +12:+13:+14:+15 */
+ vpaddq %zmm24, %zmm0, %zmm4; /* +16... */
+ vpaddq %zmm24, %zmm1, %zmm5; /* +20... */
+ vpaddq %zmm24, %zmm2, %zmm6; /* +24... */
+ vpaddq %zmm24, %zmm3, %zmm7; /* +28... */
+ vpaddq %zmm24, %zmm4, %zmm8; /* +32... */
+ vpaddq %zmm24, %zmm5, %zmm9; /* +36... */
+ vpaddq %zmm24, %zmm6, %zmm10; /* +40... */
+ vpaddq %zmm24, %zmm7, %zmm11; /* +44... */
+ vpaddq %zmm24, %zmm8, %zmm12; /* +48... */
+ vpaddq %zmm24, %zmm9, %zmm13; /* +52... */
+ vpaddq %zmm24, %zmm10, %zmm14; /* +56... */
+ vpaddq %zmm24, %zmm11, %zmm15; /* +60... */
+ jmp .Lload_ctr_done;
+
+.Lload_ctr_carry:
+ /* construct IVs */
+ add_le128(%zmm0, %zmm20, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+ add_le128(%zmm1, %zmm0, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+ add_le128(%zmm2, %zmm0, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+ add_le128(%zmm3, %zmm1, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+ add_le128(%zmm4, %zmm0, %zmm24, %zmm25); /* +16... */
+ add_le128(%zmm5, %zmm1, %zmm24, %zmm25); /* +20... */
+ add_le128(%zmm6, %zmm2, %zmm24, %zmm25); /* +24... */
+ add_le128(%zmm7, %zmm3, %zmm24, %zmm25); /* +28... */
+ add_le128(%zmm8, %zmm4, %zmm24, %zmm25); /* +32... */
+ add_le128(%zmm9, %zmm5, %zmm24, %zmm25); /* +36... */
+ add_le128(%zmm10, %zmm6, %zmm24, %zmm25); /* +40... */
+ add_le128(%zmm11, %zmm7, %zmm24, %zmm25); /* +44... */
+ add_le128(%zmm12, %zmm8, %zmm24, %zmm25); /* +48... */
+ add_le128(%zmm13, %zmm9, %zmm24, %zmm25); /* +52... */
+ add_le128(%zmm14, %zmm10, %zmm24, %zmm25); /* +56... */
+ add_le128(%zmm15, %zmm11, %zmm24, %zmm25); /* +60... */
+
+.Lload_ctr_done:
+ /* Byte-swap IVs and update counter. */
+ addq $64, %r11;
+ adcq $0, %r10;
+ vpshufb %zmm19, %zmm15, %zmm15;
+ vpshufb %zmm19, %zmm14, %zmm14;
+ vpshufb %zmm19, %zmm13, %zmm13;
+ vpshufb %zmm19, %zmm12, %zmm12;
+ vpshufb %zmm19, %zmm11, %zmm11;
+ vpshufb %zmm19, %zmm10, %zmm10;
+ vpshufb %zmm19, %zmm9, %zmm9;
+ vpshufb %zmm19, %zmm8, %zmm8;
+ bswapq %r11;
+ bswapq %r10;
+ vpshufb %zmm19, %zmm7, %zmm7;
+ vpshufb %zmm19, %zmm6, %zmm6;
+ vpshufb %zmm19, %zmm5, %zmm5;
+ vpshufb %zmm19, %zmm4, %zmm4;
+ vpshufb %zmm19, %zmm3, %zmm3;
+ vpshufb %zmm19, %zmm2, %zmm2;
+ vpshufb %zmm19, %zmm1, %zmm1;
+ vpshufb %zmm19, %zmm0, %zmm0;
+ movq %r11, 8(%r8);
+ movq %r10, (%r8);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(__aria_gfni_avx512_ctr_gen_keystream_64way)
+
+SYM_TYPED_FUNC_START(aria_gfni_avx512_ctr_crypt_64way)
+ /* input:
+ * %rdi: ctx
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: keystream
+ * %r8: iv (big endian, 128bit)
+ */
+ FRAME_BEGIN
+
+ call __aria_gfni_avx512_ctr_gen_keystream_64way
+
+ leaq (%rsi), %r10;
+ leaq (%rdx), %r11;
+ leaq (%rcx), %rsi;
+ leaq (%rcx), %rdx;
+ leaq ARIA_CTX_enc_key(CTX), %r9;
+
+ call __aria_gfni_avx512_crypt_64way;
+
+ vpxorq (0 * 64)(%r11), %zmm3, %zmm3;
+ vpxorq (1 * 64)(%r11), %zmm2, %zmm2;
+ vpxorq (2 * 64)(%r11), %zmm1, %zmm1;
+ vpxorq (3 * 64)(%r11), %zmm0, %zmm0;
+ vpxorq (4 * 64)(%r11), %zmm6, %zmm6;
+ vpxorq (5 * 64)(%r11), %zmm7, %zmm7;
+ vpxorq (6 * 64)(%r11), %zmm4, %zmm4;
+ vpxorq (7 * 64)(%r11), %zmm5, %zmm5;
+ vpxorq (8 * 64)(%r11), %zmm9, %zmm9;
+ vpxorq (9 * 64)(%r11), %zmm8, %zmm8;
+ vpxorq (10 * 64)(%r11), %zmm11, %zmm11;
+ vpxorq (11 * 64)(%r11), %zmm10, %zmm10;
+ vpxorq (12 * 64)(%r11), %zmm12, %zmm12;
+ vpxorq (13 * 64)(%r11), %zmm13, %zmm13;
+ vpxorq (14 * 64)(%r11), %zmm14, %zmm14;
+ vpxorq (15 * 64)(%r11), %zmm15, %zmm15;
+ write_output(%zmm3, %zmm2, %zmm1, %zmm0, %zmm6, %zmm7, %zmm4, %zmm5,
+ %zmm9, %zmm8, %zmm11, %zmm10, %zmm12, %zmm13, %zmm14,
+ %zmm15, %r10);
+
+ FRAME_END
+ RET;
+SYM_FUNC_END(aria_gfni_avx512_ctr_crypt_64way)
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
new file mode 100644
index 000000000000..87a11804fc77
--- /dev/null
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -0,0 +1,254 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX2/AES-NI/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_encrypt_32way);
+asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_decrypt_32way);
+asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_ctr_crypt_32way);
+#ifdef CONFIG_AS_GFNI
+asmlinkage void aria_aesni_avx2_gfni_encrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_encrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_decrypt_32way(const void *ctx, u8 *dst,
+ const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_decrypt_32way);
+asmlinkage void aria_aesni_avx2_gfni_ctr_crypt_32way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx2_gfni_ctr_crypt_32way);
+#endif /* CONFIG_AS_GFNI */
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx2_request_ctx {
+ u8 keystream[ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx2_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx2_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx2_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx2_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx2_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx2_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx2_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ecb_encrypt,
+ .decrypt = aria_avx2_ecb_decrypt,
+ }, {
+ .base.cra_name = "__ctr(aria)",
+ .base.cra_driver_name = "__ctr-aria-avx2",
+ .base.cra_priority = 500,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL |
+ CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx2_set_key,
+ .encrypt = aria_avx2_ctr_encrypt,
+ .decrypt = aria_avx2_ctr_encrypt,
+ .init = aria_avx2_init_tfm,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx2_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AES) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX2 or AES-NI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
+ &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ } else {
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way;
+ }
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx2_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx2_init);
+module_exit(aria_avx2_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX2/AES-NI/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-aesni-avx2");
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index c561ea4fefa5..4e1516b76669 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -18,21 +18,33 @@
asmlinkage void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_encrypt_16way);
asmlinkage void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_decrypt_16way);
asmlinkage void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
const u8 *src,
u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_ctr_crypt_16way);
+#ifdef CONFIG_AS_GFNI
asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_encrypt_16way);
asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx, u8 *dst,
const u8 *src);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_decrypt_16way);
asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx, u8 *dst,
const u8 *src,
u8 *keystream, u8 *iv);
+EXPORT_SYMBOL_GPL(aria_aesni_avx_gfni_ctr_crypt_16way);
+#endif /* CONFIG_AS_GFNI */
static struct aria_avx_ops aria_ops;
+struct aria_avx_request_ctx {
+ u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
+};
+
static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
{
ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
@@ -73,6 +85,7 @@ static int aria_avx_set_key(struct crypto_skcipher *tfm, const u8 *key,
static int aria_avx_ctr_encrypt(struct skcipher_request *req)
{
+ struct aria_avx_request_ctx *req_ctx = skcipher_request_ctx(req);
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
@@ -86,10 +99,9 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
u8 *dst = walk.dst.virt.addr;
while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
- u8 keystream[ARIA_AESNI_PARALLEL_BLOCK_SIZE];
-
kernel_fpu_begin();
- aria_ops.aria_ctr_crypt_16way(ctx, dst, src, keystream,
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
walk.iv);
kernel_fpu_end();
dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
@@ -98,28 +110,29 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
}
while (nbytes >= ARIA_BLOCK_SIZE) {
- u8 keystream[ARIA_BLOCK_SIZE];
-
- memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ memcpy(&req_ctx->keystream[0], walk.iv, ARIA_BLOCK_SIZE);
crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
- aria_encrypt(ctx, keystream, keystream);
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
- crypto_xor_cpy(dst, src, keystream, ARIA_BLOCK_SIZE);
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
dst += ARIA_BLOCK_SIZE;
src += ARIA_BLOCK_SIZE;
nbytes -= ARIA_BLOCK_SIZE;
}
if (walk.nbytes == walk.total && nbytes > 0) {
- u8 keystream[ARIA_BLOCK_SIZE];
-
- memcpy(keystream, walk.iv, ARIA_BLOCK_SIZE);
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
- aria_encrypt(ctx, keystream, keystream);
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
- crypto_xor_cpy(dst, src, keystream, nbytes);
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
dst += nbytes;
src += nbytes;
nbytes = 0;
@@ -130,6 +143,13 @@ static int aria_avx_ctr_encrypt(struct skcipher_request *req)
return err;
}
+static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm, sizeof(struct aria_avx_request_ctx));
+
+ return 0;
+}
+
static struct skcipher_alg aria_algs[] = {
{
.base.cra_name = "__ecb(aria)",
@@ -160,6 +180,7 @@ static struct skcipher_alg aria_algs[] = {
.setkey = aria_avx_set_key,
.encrypt = aria_avx_ctr_encrypt,
.decrypt = aria_avx_ctr_encrypt,
+ .init = aria_avx_init_tfm,
}
};
@@ -182,7 +203,7 @@ static int __init aria_avx_init(void)
return -ENODEV;
}
- if (boot_cpu_has(X86_FEATURE_GFNI)) {
+ if (boot_cpu_has(X86_FEATURE_GFNI) && IS_ENABLED(CONFIG_AS_GFNI)) {
aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c
new file mode 100644
index 000000000000..f4a2208d2638
--- /dev/null
+++ b/arch/x86/crypto/aria_gfni_avx512_glue.c
@@ -0,0 +1,250 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Glue Code for the AVX512/GFNI assembler implementation of the ARIA Cipher
+ *
+ * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/aria.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include "ecb_cbc_helpers.h"
+#include "aria-avx.h"
+
+asmlinkage void aria_gfni_avx512_encrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_decrypt_64way(const void *ctx, u8 *dst,
+ const u8 *src);
+asmlinkage void aria_gfni_avx512_ctr_crypt_64way(const void *ctx, u8 *dst,
+ const u8 *src,
+ u8 *keystream, u8 *iv);
+
+static struct aria_avx_ops aria_ops;
+
+struct aria_avx512_request_ctx {
+ u8 keystream[ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE];
+};
+
+static int ecb_do_encrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_encrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_encrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_encrypt_16way);
+ ECB_BLOCK(1, aria_encrypt);
+ ECB_WALK_END();
+}
+
+static int ecb_do_decrypt(struct skcipher_request *req, const u32 *rkey)
+{
+ ECB_WALK_START(req, ARIA_BLOCK_SIZE, ARIA_AESNI_PARALLEL_BLOCKS);
+ ECB_BLOCK(ARIA_GFNI_AVX512_PARALLEL_BLOCKS, aria_ops.aria_decrypt_64way);
+ ECB_BLOCK(ARIA_AESNI_AVX2_PARALLEL_BLOCKS, aria_ops.aria_decrypt_32way);
+ ECB_BLOCK(ARIA_AESNI_PARALLEL_BLOCKS, aria_ops.aria_decrypt_16way);
+ ECB_BLOCK(1, aria_decrypt);
+ ECB_WALK_END();
+}
+
+static int aria_avx512_ecb_encrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_encrypt(req, ctx->enc_key[0]);
+}
+
+static int aria_avx512_ecb_decrypt(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return ecb_do_decrypt(req, ctx->dec_key[0]);
+}
+
+static int aria_avx512_set_key(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ return aria_set_key(&tfm->base, key, keylen);
+}
+
+static int aria_avx512_ctr_encrypt(struct skcipher_request *req)
+{
+ struct aria_avx512_request_ctx *req_ctx = skcipher_request_ctx(req);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aria_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ while ((nbytes = walk.nbytes) > 0) {
+ const u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+
+ while (nbytes >= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_64way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ src += ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_GFNI_AVX512_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_32way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_AVX2_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_AESNI_PARALLEL_BLOCK_SIZE) {
+ kernel_fpu_begin();
+ aria_ops.aria_ctr_crypt_16way(ctx, dst, src,
+ &req_ctx->keystream[0],
+ walk.iv);
+ kernel_fpu_end();
+ dst += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ src += ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ nbytes -= ARIA_AESNI_PARALLEL_BLOCK_SIZE;
+ }
+
+ while (nbytes >= ARIA_BLOCK_SIZE) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ ARIA_BLOCK_SIZE);
+ dst += ARIA_BLOCK_SIZE;
+ src += ARIA_BLOCK_SIZE;
+ nbytes -= ARIA_BLOCK_SIZE;
+ }
+
+ if (walk.nbytes == walk.total && nbytes > 0) {
+ memcpy(&req_ctx->keystream[0], walk.iv,
+ ARIA_BLOCK_SIZE);
+ crypto_inc(walk.iv, ARIA_BLOCK_SIZE);
+
+ aria_encrypt(ctx, &req_ctx->keystream[0],
+ &req_ctx->keystream[0]);
+
+ crypto_xor_cpy(dst, src, &req_ctx->keystream[0],
+ nbytes);
+ dst += nbytes;
+ src += nbytes;
+ nbytes = 0;
+ }
+ err = skcipher_walk_done(&walk, nbytes);
+ }
+
+ return err;
+}
+
+static int aria_avx512_init_tfm(struct crypto_skcipher *tfm)
+{
+ crypto_skcipher_set_reqsize(tfm,
+ sizeof(struct aria_avx512_request_ctx));
+
+ return 0;
+}
+
+static struct skcipher_alg aria_algs[] = {
+ {
+ .base.cra_name = "__ecb(aria)",
+ .base.cra_driver_name = "__ecb-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+ .base.cra_blocksize = ARIA_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ecb_encrypt,
+ .decrypt = aria_avx512_ecb_decrypt,
+ }, {
+ .base.cra_name = "__ctr(aria)",
+ .base.cra_driver_name = "__ctr-aria-avx512",
+ .base.cra_priority = 600,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL |
+ CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct aria_ctx),
+ .base.cra_module = THIS_MODULE,
+ .min_keysize = ARIA_MIN_KEY_SIZE,
+ .max_keysize = ARIA_MAX_KEY_SIZE,
+ .ivsize = ARIA_BLOCK_SIZE,
+ .chunksize = ARIA_BLOCK_SIZE,
+ .setkey = aria_avx512_set_key,
+ .encrypt = aria_avx512_ctr_encrypt,
+ .decrypt = aria_avx512_ctr_encrypt,
+ .init = aria_avx512_init_tfm,
+ }
+};
+
+static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
+
+static int __init aria_avx512_init(void)
+{
+ const char *feature_name;
+
+ if (!boot_cpu_has(X86_FEATURE_AVX) ||
+ !boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_AVX512F) ||
+ !boot_cpu_has(X86_FEATURE_AVX512VL) ||
+ !boot_cpu_has(X86_FEATURE_GFNI) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE)) {
+ pr_info("AVX512/GFNI instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, &feature_name)) {
+ pr_info("CPU feature '%s' is not supported.\n", feature_name);
+ return -ENODEV;
+ }
+
+ aria_ops.aria_encrypt_16way = aria_aesni_avx_gfni_encrypt_16way;
+ aria_ops.aria_decrypt_16way = aria_aesni_avx_gfni_decrypt_16way;
+ aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_gfni_ctr_crypt_16way;
+ aria_ops.aria_encrypt_32way = aria_aesni_avx2_gfni_encrypt_32way;
+ aria_ops.aria_decrypt_32way = aria_aesni_avx2_gfni_decrypt_32way;
+ aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_gfni_ctr_crypt_32way;
+ aria_ops.aria_encrypt_64way = aria_gfni_avx512_encrypt_64way;
+ aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way;
+ aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way;
+
+ return simd_register_skciphers_compat(aria_algs,
+ ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+static void __exit aria_avx512_exit(void)
+{
+ simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
+ aria_simd_algs);
+}
+
+module_init(aria_avx512_init);
+module_exit(aria_avx512_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Taehee Yoo <ap420073@gmail.com>");
+MODULE_DESCRIPTION("ARIA Cipher Algorithm, AVX512/GFNI optimized");
+MODULE_ALIAS_CRYPTO("aria");
+MODULE_ALIAS_CRYPTO("aria-gfni-avx512");
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index aaba21230528..0313f9673f56 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -8,7 +8,6 @@
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/sizes.h>
#include <asm/cpufeature.h>
@@ -72,6 +71,4 @@ static int __init blake2s_mod_init(void)
return 0;
}
-module_init(blake2s_mod_init);
-
-MODULE_LICENSE("GPL v2");
+subsys_initcall(blake2s_mod_init);
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 4a43e072d2d1..e88c8e4f013c 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -6,7 +6,6 @@
*/
#include <linux/linkage.h>
-#include <linux/cfi_types.h>
.file "blowfish-x86_64-asm.S"
.text
@@ -100,16 +99,11 @@
bswapq RX0; \
movq RX0, (RIO);
-#define xor_block() \
- bswapq RX0; \
- xorq RX0, (RIO);
-
-SYM_FUNC_START(__blowfish_enc_blk)
+SYM_FUNC_START(blowfish_enc_blk)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
- * %rcx: bool, if true: xor output
*/
movq %r12, %r11;
@@ -130,19 +124,13 @@ SYM_FUNC_START(__blowfish_enc_blk)
add_roundkey_enc(16);
movq %r11, %r12;
-
movq %r10, RIO;
- test %cl, %cl;
- jnz .L__enc_xor;
write_block();
RET;
-.L__enc_xor:
- xor_block();
- RET;
-SYM_FUNC_END(__blowfish_enc_blk)
+SYM_FUNC_END(blowfish_enc_blk)
-SYM_TYPED_FUNC_START(blowfish_dec_blk)
+SYM_FUNC_START(blowfish_dec_blk)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -272,28 +260,26 @@ SYM_FUNC_END(blowfish_dec_blk)
movq RX3, 24(RIO);
#define xor_block4() \
- bswapq RX0; \
- xorq RX0, (RIO); \
+ movq (RIO), RT0; \
+ bswapq RT0; \
+ xorq RT0, RX1; \
\
- bswapq RX1; \
- xorq RX1, 8(RIO); \
+ movq 8(RIO), RT2; \
+ bswapq RT2; \
+ xorq RT2, RX2; \
\
- bswapq RX2; \
- xorq RX2, 16(RIO); \
- \
- bswapq RX3; \
- xorq RX3, 24(RIO);
+ movq 16(RIO), RT3; \
+ bswapq RT3; \
+ xorq RT3, RX3;
-SYM_FUNC_START(__blowfish_enc_blk_4way)
+SYM_FUNC_START(blowfish_enc_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
- * %rcx: bool, if true: xor output
*/
pushq %r12;
pushq %rbx;
- pushq %rcx;
movq %rdi, CTX
movq %rsi, %r11;
@@ -313,37 +299,28 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();
- popq %r12;
movq %r11, RIO;
-
- test %r12b, %r12b;
- jnz .L__enc_xor4;
-
write_block4();
popq %rbx;
popq %r12;
RET;
+SYM_FUNC_END(blowfish_enc_blk_4way)
-.L__enc_xor4:
- xor_block4();
-
- popq %rbx;
- popq %r12;
- RET;
-SYM_FUNC_END(__blowfish_enc_blk_4way)
-
-SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
+SYM_FUNC_START(__blowfish_dec_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
* %rdx: src
+ * %rcx: cbc (bool)
*/
pushq %r12;
pushq %rbx;
+ pushq %rcx;
+ pushq %rdx;
movq %rdi, CTX;
- movq %rsi, %r11
+ movq %rsi, %r11;
movq %rdx, RIO;
preload_roundkey_dec(17);
@@ -359,6 +336,14 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
round_dec4(3);
add_preloaded_roundkey4();
+ popq RIO;
+ popq %r12;
+ testq %r12, %r12;
+ jz .L_no_cbc_xor;
+
+ xor_block4();
+
+.L_no_cbc_xor:
movq %r11, RIO;
write_block4();
@@ -366,4 +351,4 @@ SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
popq %r12;
RET;
-SYM_FUNC_END(blowfish_dec_blk_4way)
+SYM_FUNC_END(__blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c
index 019c64c1340a..552f2df0643f 100644
--- a/arch/x86/crypto/blowfish_glue.c
+++ b/arch/x86/crypto/blowfish_glue.c
@@ -16,26 +16,28 @@
#include <linux/module.h>
#include <linux/types.h>
+#include "ecb_cbc_helpers.h"
+
/* regular block cipher functions */
-asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
- bool xor);
+asmlinkage void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
/* 4-way parallel cipher functions */
-asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+asmlinkage void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
const u8 *src);
+asmlinkage void __blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src, bool cbc);
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+static inline void blowfish_dec_ecb_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
{
- __blowfish_enc_blk(ctx, dst, src, false);
+ return __blowfish_dec_blk_4way(ctx, dst, src, false);
}
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void blowfish_dec_cbc_4way(struct bf_ctx *ctx, u8 *dst,
+ const u8 *src)
{
- __blowfish_enc_blk_4way(ctx, dst, src, false);
+ return __blowfish_dec_blk_4way(ctx, dst, src, true);
}
static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
@@ -54,183 +56,35 @@ static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
return blowfish_setkey(&tfm->base, key, keylen);
}
-static int ecb_crypt(struct skcipher_request *req,
- void (*fn)(struct bf_ctx *, u8 *, const u8 *),
- void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while ((nbytes = walk.nbytes)) {
- u8 *wsrc = walk.src.virt.addr;
- u8 *wdst = walk.dst.virt.addr;
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- fn_4way(ctx, wdst, wsrc);
-
- wsrc += bsize * 4;
- wdst += bsize * 4;
- nbytes -= bsize * 4;
- } while (nbytes >= bsize * 4);
-
- if (nbytes < bsize)
- goto done;
- }
-
- /* Handle leftovers */
- do {
- fn(ctx, wdst, wsrc);
-
- wsrc += bsize;
- wdst += bsize;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
-done:
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
static int ecb_encrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way);
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_enc_blk_4way);
+ ECB_BLOCK(1, blowfish_enc_blk);
+ ECB_WALK_END();
}
static int ecb_decrypt(struct skcipher_request *req)
{
- return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way);
-}
-
-static unsigned int __cbc_encrypt(struct bf_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 *iv = (u64 *)walk->iv;
-
- do {
- *dst = *src ^ *iv;
- blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
- iv = dst;
-
- src += 1;
- dst += 1;
- nbytes -= bsize;
- } while (nbytes >= bsize);
-
- *(u64 *)walk->iv = *iv;
- return nbytes;
+ ECB_WALK_START(req, BF_BLOCK_SIZE, -1);
+ ECB_BLOCK(4, blowfish_dec_ecb_4way);
+ ECB_BLOCK(1, blowfish_dec_blk);
+ ECB_WALK_END();
}
static int cbc_encrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while (walk.nbytes) {
- nbytes = __cbc_encrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static unsigned int __cbc_decrypt(struct bf_ctx *ctx,
- struct skcipher_walk *walk)
-{
- unsigned int bsize = BF_BLOCK_SIZE;
- unsigned int nbytes = walk->nbytes;
- u64 *src = (u64 *)walk->src.virt.addr;
- u64 *dst = (u64 *)walk->dst.virt.addr;
- u64 ivs[4 - 1];
- u64 last_iv;
-
- /* Start of the last block. */
- src += nbytes / bsize - 1;
- dst += nbytes / bsize - 1;
-
- last_iv = *src;
-
- /* Process four block batch */
- if (nbytes >= bsize * 4) {
- do {
- nbytes -= bsize * 4 - bsize;
- src -= 4 - 1;
- dst -= 4 - 1;
-
- ivs[0] = src[0];
- ivs[1] = src[1];
- ivs[2] = src[2];
-
- blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
-
- dst[1] ^= ivs[0];
- dst[2] ^= ivs[1];
- dst[3] ^= ivs[2];
-
- nbytes -= bsize;
- if (nbytes < bsize)
- goto done;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- } while (nbytes >= bsize * 4);
- }
-
- /* Handle leftovers */
- for (;;) {
- blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
-
- nbytes -= bsize;
- if (nbytes < bsize)
- break;
-
- *dst ^= *(src - 1);
- src -= 1;
- dst -= 1;
- }
-
-done:
- *dst ^= *(u64 *)walk->iv;
- *(u64 *)walk->iv = last_iv;
-
- return nbytes;
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_ENC_BLOCK(blowfish_enc_blk);
+ CBC_WALK_END();
}
static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- while (walk.nbytes) {
- nbytes = __cbc_decrypt(ctx, &walk);
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
+ CBC_WALK_START(req, BF_BLOCK_SIZE, -1);
+ CBC_DEC_BLOCK(4, blowfish_dec_cbc_4way);
+ CBC_DEC_BLOCK(1, blowfish_dec_blk);
+ CBC_WALK_END();
}
static struct crypto_alg bf_cipher_alg = {
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index 2e1658ddbe1a..646477a13e11 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -52,10 +52,10 @@
/* \
* S-function with AES subbytes \
*/ \
- vmovdqa .Linv_shift_row, t4; \
- vbroadcastss .L0f0f0f0f, t7; \
- vmovdqa .Lpre_tf_lo_s1, t0; \
- vmovdqa .Lpre_tf_hi_s1, t1; \
+ vmovdqa .Linv_shift_row(%rip), t4; \
+ vbroadcastss .L0f0f0f0f(%rip), t7; \
+ vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
@@ -68,8 +68,8 @@
vpshufb t4, x6, x6; \
\
/* prefilter sboxes 1, 2 and 3 */ \
- vmovdqa .Lpre_tf_lo_s4, t2; \
- vmovdqa .Lpre_tf_hi_s4, t3; \
+ vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
+ vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x1, t0, t1, t7, t6); \
@@ -83,8 +83,8 @@
filter_8bit(x6, t2, t3, t7, t6); \
\
/* AES subbytes + AES shift rows */ \
- vmovdqa .Lpost_tf_lo_s1, t0; \
- vmovdqa .Lpost_tf_hi_s1, t1; \
+ vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
+ vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
vaesenclast t4, x0, x0; \
vaesenclast t4, x7, x7; \
vaesenclast t4, x1, x1; \
@@ -95,16 +95,16 @@
vaesenclast t4, x6, x6; \
\
/* postfilter sboxes 1 and 4 */ \
- vmovdqa .Lpost_tf_lo_s3, t2; \
- vmovdqa .Lpost_tf_hi_s3, t3; \
+ vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
+ vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
\
/* postfilter sbox 3 */ \
- vmovdqa .Lpost_tf_lo_s2, t4; \
- vmovdqa .Lpost_tf_hi_s2, t5; \
+ vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
+ vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
@@ -443,7 +443,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vmovdqu .Lshufb_16x16b, a0; \
+ vmovdqu .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -482,7 +482,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
vmovq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor 0 * 16(rio), x0, y7; \
vpxor 1 * 16(rio), x0, y6; \
@@ -533,7 +533,7 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vmovdqu x0, stack_tmp0; \
\
vmovq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor x0, y7, y7; \
vpxor x0, y6, y6; \
@@ -712,7 +712,6 @@ SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.text
-.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
/* input:
* %rdi: ctx, CTX
@@ -799,7 +798,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)
-.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 0e4e9abbf4de..a0eb94e53b1b 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -64,12 +64,12 @@
/* \
* S-function with AES subbytes \
*/ \
- vbroadcasti128 .Linv_shift_row, t4; \
- vpbroadcastd .L0f0f0f0f, t7; \
- vbroadcasti128 .Lpre_tf_lo_s1, t5; \
- vbroadcasti128 .Lpre_tf_hi_s1, t6; \
- vbroadcasti128 .Lpre_tf_lo_s4, t2; \
- vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+ vbroadcasti128 .Linv_shift_row(%rip), t4; \
+ vpbroadcastd .L0f0f0f0f(%rip), t7; \
+ vbroadcasti128 .Lpre_tf_lo_s1(%rip), t5; \
+ vbroadcasti128 .Lpre_tf_hi_s1(%rip), t6; \
+ vbroadcasti128 .Lpre_tf_lo_s4(%rip), t2; \
+ vbroadcasti128 .Lpre_tf_hi_s4(%rip), t3; \
\
/* AES inverse shift rows */ \
vpshufb t4, x0, x0; \
@@ -115,8 +115,8 @@
vinserti128 $1, t2##_x, x6, x6; \
vextracti128 $1, x1, t3##_x; \
vextracti128 $1, x4, t2##_x; \
- vbroadcasti128 .Lpost_tf_lo_s1, t0; \
- vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+ vbroadcasti128 .Lpost_tf_lo_s1(%rip), t0; \
+ vbroadcasti128 .Lpost_tf_hi_s1(%rip), t1; \
vaesenclast t4##_x, x2##_x, x2##_x; \
vaesenclast t4##_x, t6##_x, t6##_x; \
vinserti128 $1, t6##_x, x2, x2; \
@@ -131,16 +131,16 @@
vinserti128 $1, t2##_x, x4, x4; \
\
/* postfilter sboxes 1 and 4 */ \
- vbroadcasti128 .Lpost_tf_lo_s3, t2; \
- vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+ vbroadcasti128 .Lpost_tf_lo_s3(%rip), t2; \
+ vbroadcasti128 .Lpost_tf_hi_s3(%rip), t3; \
filter_8bit(x0, t0, t1, t7, t6); \
filter_8bit(x7, t0, t1, t7, t6); \
filter_8bit(x3, t0, t1, t7, t6); \
filter_8bit(x6, t0, t1, t7, t6); \
\
/* postfilter sbox 3 */ \
- vbroadcasti128 .Lpost_tf_lo_s2, t4; \
- vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+ vbroadcasti128 .Lpost_tf_lo_s2(%rip), t4; \
+ vbroadcasti128 .Lpost_tf_hi_s2(%rip), t5; \
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
@@ -221,7 +221,6 @@
* Size optimization... with inlined roundsm32 binary would be over 5 times
* larger and would only marginally faster.
*/
-.align 8
SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
@@ -229,7 +228,6 @@ SYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_c
RET;
SYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
-.align 8
SYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
%ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
@@ -477,7 +475,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
transpose_4x4(c0, c1, c2, c3, a0, a1); \
transpose_4x4(d0, d1, d2, d3, a0, a1); \
\
- vbroadcasti128 .Lshufb_16x16b, a0; \
+ vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
vmovdqu st1, a1; \
vpshufb a0, a2, a2; \
vpshufb a0, a3, a3; \
@@ -516,7 +514,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
y6, y7, rio, key) \
vpbroadcastq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor 0 * 32(rio), x0, y7; \
vpxor 1 * 32(rio), x0, y6; \
@@ -567,7 +565,7 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
vmovdqu x0, stack_tmp0; \
\
vpbroadcastq key, x0; \
- vpshufb .Lpack_bswap, x0, x0; \
+ vpshufb .Lpack_bswap(%rip), x0, x0; \
\
vpxor x0, y7, y7; \
vpxor x0, y6, y6; \
@@ -748,7 +746,6 @@ SYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
.text
-.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
/* input:
* %rdi: ctx, CTX
@@ -835,7 +832,6 @@ SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)
-.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 347c059f5940..816b6bb8bded 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -77,11 +77,13 @@
#define RXORbl %r9b
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
+ leaq T0(%rip), tmp1; \
movzbl ab ## bl, tmp2 ## d; \
+ xorq (tmp1, tmp2, 8), dst; \
+ leaq T1(%rip), tmp2; \
movzbl ab ## bh, tmp1 ## d; \
rorq $16, ab; \
- xorq T0(, tmp2, 8), dst; \
- xorq T1(, tmp1, 8), dst;
+ xorq (tmp2, tmp1, 8), dst;
/**********************************************************************
1-way camellia
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b258af420c92..b4e460a87f18 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -84,15 +84,19 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
shrq $16, src; \
- movl s1(, RID1, 4), dst ## d; \
- op1 s2(, RID2, 4), dst ## d; \
movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
interleave_op(il_reg); \
- op2 s3(, RID1, 4), dst ## d; \
- op3 s4(, RID2, 4), dst ## d;
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
#define dummy(d) /* do nothing */
@@ -151,15 +155,15 @@
subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
#define enc_preload_rkr() \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor kr(CTX), RKR, RKR;
#define dec_preload_rkr() \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor kr(CTX), RKR, RKR; \
- vpshufb .Lbswap128_mask, RKR, RKR;
+ vpshufb .Lbswap128_mask(%rip), RKR, RKR;
#define transpose_2x4(x0, x1, t0, t1) \
vpunpckldq x1, x0, t0; \
@@ -208,7 +212,6 @@
.text
-.align 16
SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
/* input:
* %rdi: ctx
@@ -236,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
enc_preload_rkr();
inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -272,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -282,7 +285,6 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16)
RET;
SYM_FUNC_END(__cast5_enc_blk16)
-.align 16
SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
/* input:
* %rdi: ctx
@@ -310,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
dec_preload_rkr();
inpack_blocks(RL1, RR1, RTMP, RX, RKM);
@@ -343,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16)
round(RL, RR, 1, 2);
round(RR, RL, 0, 1);
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
popq %rbx;
popq %r15;
@@ -506,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way)
vpcmpeqd RKR, RKR, RKR;
vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
- vmovdqa .Lbswap_iv_mask, R1ST;
- vmovdqa .Lbswap128_mask, RKM;
+ vmovdqa .Lbswap_iv_mask(%rip), R1ST;
+ vmovdqa .Lbswap128_mask(%rip), RKM;
/* load IV and byteswap */
vmovq (%rcx), RX;
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 82b716fd5dba..9e86d460b409 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -84,15 +84,19 @@
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
movzbl src ## bh, RID1d; \
+ leaq s1(%rip), RID2; \
+ movl (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
+ leaq s2(%rip), RID1; \
+ op1 (RID1,RID2,4), dst ## d; \
shrq $16, src; \
- movl s1(, RID1, 4), dst ## d; \
- op1 s2(, RID2, 4), dst ## d; \
movzbl src ## bh, RID1d; \
+ leaq s3(%rip), RID2; \
+ op2 (RID2,RID1,4), dst ## d; \
movzbl src ## bl, RID2d; \
interleave_op(il_reg); \
- op2 s3(, RID1, 4), dst ## d; \
- op3 s4(, RID2, 4), dst ## d;
+ leaq s4(%rip), RID1; \
+ op3 (RID1,RID2,4), dst ## d;
#define dummy(d) /* do nothing */
@@ -175,10 +179,10 @@
qop(RD, RC, 1);
#define shuffle(mask) \
- vpshufb mask, RKR, RKR;
+ vpshufb mask(%rip), RKR, RKR;
#define preload_rkr(n, do_mask, mask) \
- vbroadcastss .L16_mask, RKR; \
+ vbroadcastss .L16_mask(%rip), RKR; \
/* add 16-bit rotation to key rotations (mod 32) */ \
vpxor (kr+n*16)(CTX), RKR, RKR; \
do_mask(mask);
@@ -258,9 +262,9 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -284,7 +288,7 @@ SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -306,9 +310,9 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
movq %rdi, CTX;
- vmovdqa .Lbswap_mask, RKM;
- vmovd .Lfirst_mask, R1ST;
- vmovd .L32_mask, R32;
+ vmovdqa .Lbswap_mask(%rip), RKM;
+ vmovd .Lfirst_mask(%rip), R1ST;
+ vmovd .L32_mask(%rip), R32;
inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
@@ -332,7 +336,7 @@ SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
popq %rbx;
popq %r15;
- vmovdqa .Lbswap_mask, RKM;
+ vmovdqa .Lbswap_mask(%rip), RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index ca53e96996ac..5d31137e2c7d 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -90,7 +90,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne
sub $0x40, LEN
add $0x40, BUF
cmp $0x40, LEN
- jb less_64
+ jb .Lless_64
#ifdef __x86_64__
movdqa .Lconstant_R2R1(%rip), CONSTANT
@@ -98,7 +98,7 @@ SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligne
movdqa .Lconstant_R2R1, CONSTANT
#endif
-loop_64:/* 64 bytes Full cache line folding */
+.Lloop_64:/* 64 bytes Full cache line folding */
prefetchnta 0x40(BUF)
movdqa %xmm1, %xmm5
movdqa %xmm2, %xmm6
@@ -139,8 +139,8 @@ loop_64:/* 64 bytes Full cache line folding */
sub $0x40, LEN
add $0x40, BUF
cmp $0x40, LEN
- jge loop_64
-less_64:/* Folding cache line into 128bit */
+ jge .Lloop_64
+.Lless_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
@@ -167,8 +167,8 @@ less_64:/* Folding cache line into 128bit */
pxor %xmm4, %xmm1
cmp $0x10, LEN
- jb fold_64
-loop_16:/* Folding rest buffer into 128bit */
+ jb .Lfold_64
+.Lloop_16:/* Folding rest buffer into 128bit */
movdqa %xmm1, %xmm5
pclmulqdq $0x00, CONSTANT, %xmm1
pclmulqdq $0x11, CONSTANT, %xmm5
@@ -177,9 +177,9 @@ loop_16:/* Folding rest buffer into 128bit */
sub $0x10, LEN
add $0x10, BUF
cmp $0x10, LEN
- jge loop_16
+ jge .Lloop_16
-fold_64:
+.Lfold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index ec35915f0901..81ce0f4db555 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -49,15 +49,15 @@
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
.macro LABEL prefix n
-\prefix\n\():
+.L\prefix\n\():
.endm
.macro JMPTBL_ENTRY i
-.quad crc_\i
+.quad .Lcrc_\i
.endm
.macro JNC_LESS_THAN j
- jnc less_than_\j
+ jnc .Lless_than_\j
.endm
# Define threshold where buffers are considered "small" and routed to more
@@ -108,30 +108,30 @@ SYM_FUNC_START(crc_pcl)
neg %bufp
and $7, %bufp # calculate the unalignment amount of
# the address
- je proc_block # Skip if aligned
+ je .Lproc_block # Skip if aligned
## If len is less than 8 and we're unaligned, we need to jump
## to special code to avoid reading beyond the end of the buffer
cmp $8, len
- jae do_align
+ jae .Ldo_align
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $32-3+1, len_dw
- jmp less_than_8_post_shl1
+ jmp .Lless_than_8_post_shl1
-do_align:
+.Ldo_align:
#### Calculate CRC of unaligned bytes of the buffer (if any)
movq (bufptmp), tmp # load a quadward from the buffer
add %bufp, bufptmp # align buffer pointer for quadword
# processing
sub %bufp, len # update buffer length
-align_loop:
+.Lalign_loop:
crc32b %bl, crc_init_dw # compute crc32 of 1-byte
shr $8, tmp # get next byte
dec %bufp
- jne align_loop
+ jne .Lalign_loop
-proc_block:
+.Lproc_block:
################################################################
## 2) PROCESS BLOCKS:
@@ -141,11 +141,11 @@ proc_block:
movq len, tmp # save num bytes in tmp
cmpq $128*24, len
- jae full_block
+ jae .Lfull_block
-continue_block:
+.Lcontinue_block:
cmpq $SMALL_SIZE, len
- jb small
+ jb .Lsmall
## len < 128*24
movq $2731, %rax # 2731 = ceil(2^16 / 24)
@@ -168,13 +168,14 @@ continue_block:
xor crc2, crc2
## branch into array
- mov jump_table(,%rax,8), %bufp
+ leaq jump_table(%rip), %bufp
+ mov (%bufp,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
## 2a) PROCESS FULL BLOCKS:
################################################################
-full_block:
+.Lfull_block:
movl $128,%eax
lea 128*8*2(block_0), block_1
lea 128*8*3(block_0), block_2
@@ -189,7 +190,6 @@ full_block:
## 3) CRC Array:
################################################################
-crc_array:
i=128
.rept 128-1
.altmacro
@@ -242,28 +242,28 @@ LABEL crc_ 0
ENDBR
mov tmp, len
cmp $128*24, tmp
- jae full_block
+ jae .Lfull_block
cmp $24, tmp
- jae continue_block
+ jae .Lcontinue_block
-less_than_24:
+.Lless_than_24:
shl $32-4, len_dw # less_than_16 expects length
# in upper 4 bits of len_dw
- jnc less_than_16
+ jnc .Lless_than_16
crc32q (bufptmp), crc_init
crc32q 8(bufptmp), crc_init
- jz do_return
+ jz .Ldo_return
add $16, bufptmp
# len is less than 8 if we got here
# less_than_8 expects length in upper 3 bits of len_dw
# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
shl $2, len_dw
- jmp less_than_8_post_shl1
+ jmp .Lless_than_8_post_shl1
#######################################################################
## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
#######################################################################
-small:
+.Lsmall:
shl $32-8, len_dw # Prepare len_dw for less_than_256
j=256
.rept 5 # j = {256, 128, 64, 32, 16}
@@ -279,32 +279,32 @@ LABEL less_than_ %j # less_than_j: Length should be in
crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
i=i+8
.endr
- jz do_return # Return if remaining length is zero
+ jz .Ldo_return # Return if remaining length is zero
add $j, bufptmp # Advance buf
.endr
-less_than_8: # Length should be stored in
+.Lless_than_8: # Length should be stored in
# upper 3 bits of len_dw
shl $1, len_dw
-less_than_8_post_shl1:
- jnc less_than_4
+.Lless_than_8_post_shl1:
+ jnc .Lless_than_4
crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $4, bufptmp
-less_than_4: # Length should be stored in
+.Lless_than_4: # Length should be stored in
# upper 2 bits of len_dw
shl $1, len_dw
- jnc less_than_2
+ jnc .Lless_than_2
crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz do_return # return if remaining data is zero
+ jz .Ldo_return # return if remaining data is zero
add $2, bufptmp
-less_than_2: # Length should be stored in the MSB
+.Lless_than_2: # Length should be stored in the MSB
# of len_dw
shl $1, len_dw
- jnc less_than_1
+ jnc .Lless_than_1
crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-less_than_1: # Length should be zero
-do_return:
+.Lless_than_1: # Length should be zero
+.Ldo_return:
movq crc_init, %rax
popq %rsi
popq %rdi
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index 721474abfb71..5286db5b8165 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -94,7 +94,6 @@
#
# Assumes len >= 16.
#
-.align 16
SYM_FUNC_START(crc_t10dif_pcl)
movdqa .Lbswap_mask(%rip), BSWAP_MASK
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f4c760f4cade..cf21b998e77c 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -129,21 +129,29 @@
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
shrq $16, RW0; \
- movq s8(, RT0, 8), RT0; \
- xorq s6(, RT1, 8), to; \
+ leaq s8(%rip), RW1; \
+ movq (RW1, RT0, 8), RT0; \
+ leaq s6(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
movzbl RW0bl, RL1d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s4(, RT2, 8), RT0; \
- xorq s2(, RT3, 8), to; \
+ leaq s4(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
+ leaq s2(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
movzbl RW0bl, RT2d; \
movzbl RW0bh, RT3d; \
- xorq s7(, RL1, 8), RT0; \
- xorq s5(, RT1, 8), to; \
- xorq s3(, RT2, 8), RT0; \
+ leaq s7(%rip), RW1; \
+ xorq (RW1, RL1, 8), RT0; \
+ leaq s5(%rip), RW1; \
+ xorq (RW1, RT1, 8), to; \
+ leaq s3(%rip), RW1; \
+ xorq (RW1, RT2, 8), RT0; \
load_next_key(n, RW0); \
xorq RT0, to; \
- xorq s1(, RT3, 8), to; \
+ leaq s1(%rip), RW1; \
+ xorq (RW1, RT3, 8), to; \
#define load_next_key(n, RWx) \
movq (((n) + 1) * 8)(CTX), RWx;
@@ -355,65 +363,89 @@ SYM_FUNC_END(des3_ede_x86_64_crypt_blk)
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s8(, RT3, 8), to##0; \
- xorq s6(, RT1, 8), to##0; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrq $16, RW0; \
- xorq s4(, RT3, 8), to##0; \
- xorq s2(, RT1, 8), to##0; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
shrl $16, RW0d; \
- xorq s7(, RT3, 8), to##0; \
- xorq s5(, RT1, 8), to##0; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
movzbl RW0bl, RT3d; \
movzbl RW0bh, RT1d; \
load_next_key(n, RW0); \
- xorq s3(, RT3, 8), to##0; \
- xorq s1(, RT1, 8), to##0; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##0; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##0; \
xorq from##1, RW1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s8(, RT3, 8), to##1; \
- xorq s6(, RT1, 8), to##1; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrq $16, RW1; \
- xorq s4(, RT3, 8), to##1; \
- xorq s2(, RT1, 8), to##1; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
shrl $16, RW1d; \
- xorq s7(, RT3, 8), to##1; \
- xorq s5(, RT1, 8), to##1; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
movzbl RW1bl, RT3d; \
movzbl RW1bh, RT1d; \
do_movq(RW0, RW1); \
- xorq s3(, RT3, 8), to##1; \
- xorq s1(, RT1, 8), to##1; \
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##1; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##1; \
xorq from##2, RW2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s8(, RT3, 8), to##2; \
- xorq s6(, RT1, 8), to##2; \
+ leaq s8(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s6(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrq $16, RW2; \
- xorq s4(, RT3, 8), to##2; \
- xorq s2(, RT1, 8), to##2; \
+ leaq s4(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s2(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
shrl $16, RW2d; \
- xorq s7(, RT3, 8), to##2; \
- xorq s5(, RT1, 8), to##2; \
+ leaq s7(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s5(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2; \
movzbl RW2bl, RT3d; \
movzbl RW2bh, RT1d; \
do_movq(RW0, RW2); \
- xorq s3(, RT3, 8), to##2; \
- xorq s1(, RT1, 8), to##2;
+ leaq s3(%rip), RT2; \
+ xorq (RT2, RT3, 8), to##2; \
+ leaq s1(%rip), RT2; \
+ xorq (RT2, RT1, 8), to##2;
#define __movq(src, dst) \
movq src, dst;
diff --git a/arch/x86/crypto/ecb_cbc_helpers.h b/arch/x86/crypto/ecb_cbc_helpers.h
index eaa15c7b29d6..11955bd01af1 100644
--- a/arch/x86/crypto/ecb_cbc_helpers.h
+++ b/arch/x86/crypto/ecb_cbc_helpers.h
@@ -13,13 +13,14 @@
#define ECB_WALK_START(req, bsize, fpu_blocks) do { \
void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); \
+ const int __fpu_blocks = (fpu_blocks); \
const int __bsize = (bsize); \
struct skcipher_walk walk; \
int err = skcipher_walk_virt(&walk, (req), false); \
while (walk.nbytes > 0) { \
unsigned int nbytes = walk.nbytes; \
- bool do_fpu = (fpu_blocks) != -1 && \
- nbytes >= (fpu_blocks) * __bsize; \
+ bool do_fpu = __fpu_blocks != -1 && \
+ nbytes >= __fpu_blocks * __bsize; \
const u8 *src = walk.src.virt.addr; \
u8 *dst = walk.dst.virt.addr; \
u8 __maybe_unused buf[(bsize)]; \
@@ -35,7 +36,12 @@
} while (0)
#define ECB_BLOCK(blocks, func) do { \
- while (nbytes >= (blocks) * __bsize) { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
(func)(ctx, dst, src); \
ECB_WALK_ADVANCE(blocks); \
} \
@@ -53,7 +59,12 @@
} while (0)
#define CBC_DEC_BLOCK(blocks, func) do { \
- while (nbytes >= (blocks) * __bsize) { \
+ const int __blocks = (blocks); \
+ if (do_fpu && __blocks < __fpu_blocks) { \
+ kernel_fpu_end(); \
+ do_fpu = false; \
+ } \
+ while (nbytes >= __blocks * __bsize) { \
const u8 *__iv = src + ((blocks) - 1) * __bsize; \
if (dst == src) \
__iv = memcpy(buf, __iv, __bsize); \
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 2bf871899920..99cb983ded9e 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -4,7 +4,7 @@
* instructions. This file contains accelerated part of ghash
* implementation. More information about PCLMULQDQ can be found at:
*
- * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ * https://www.intel.com/content/dam/develop/external/us/en/documents/clmul-wp-rev-2-02-2014-04-20.pdf
*
* Copyright (c) 2009 Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
@@ -88,12 +88,12 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble)
RET
SYM_FUNC_END(__clmul_gf128mul_ble)
-/* void clmul_ghash_mul(char *dst, const u128 *shash) */
+/* void clmul_ghash_mul(char *dst, const le128 *shash) */
SYM_FUNC_START(clmul_ghash_mul)
FRAME_BEGIN
movups (%rdi), DATA
movups (%rsi), SHASH
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
pshufb BSWAP, DATA
call __clmul_gf128mul_ble
pshufb BSWAP, DATA
@@ -104,13 +104,13 @@ SYM_FUNC_END(clmul_ghash_mul)
/*
* void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const u128 *shash);
+ * const le128 *shash);
*/
SYM_FUNC_START(clmul_ghash_update)
FRAME_BEGIN
cmp $16, %rdx
jb .Lupdate_just_ret # check length
- movaps .Lbswap_mask, BSWAP
+ movaps .Lbswap_mask(%rip), BSWAP
movups (%rdi), DATA
movups (%rcx), SHASH
pshufb BSWAP, DATA
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 1f1a95f3dd0c..700ecaee9a08 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,21 +19,22 @@
#include <crypto/internal/simd.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
+#include <asm/unaligned.h>
#define GHASH_BLOCK_SIZE 16
#define GHASH_DIGEST_SIZE 16
-void clmul_ghash_mul(char *dst, const u128 *shash);
+void clmul_ghash_mul(char *dst, const le128 *shash);
void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- const u128 *shash);
+ const le128 *shash);
struct ghash_async_ctx {
struct cryptd_ahash *cryptd_tfm;
};
struct ghash_ctx {
- u128 shash;
+ le128 shash;
};
struct ghash_desc_ctx {
@@ -54,22 +55,40 @@ static int ghash_setkey(struct crypto_shash *tfm,
const u8 *key, unsigned int keylen)
{
struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
- be128 *x = (be128 *)key;
u64 a, b;
if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- /* perform multiplication by 'x' in GF(2^128) */
- a = be64_to_cpu(x->a);
- b = be64_to_cpu(x->b);
-
- ctx->shash.a = (b << 1) | (a >> 63);
- ctx->shash.b = (a << 1) | (b >> 63);
-
+ /*
+ * GHASH maps bits to polynomial coefficients backwards, which makes it
+ * hard to implement. But it can be shown that the GHASH multiplication
+ *
+ * D * K (mod x^128 + x^7 + x^2 + x + 1)
+ *
+ * (where D is a data block and K is the key) is equivalent to:
+ *
+ * bitreflect(D) * bitreflect(K) * x^(-127)
+ * (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * So, the code below precomputes:
+ *
+ * bitreflect(K) * x^(-127) (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * ... but in Montgomery form (so that Montgomery multiplication can be
+ * used), i.e. with an extra x^128 factor, which means actually:
+ *
+ * bitreflect(K) * x (mod x^128 + x^127 + x^126 + x^121 + 1)
+ *
+ * The within-a-byte part of bitreflect() cancels out GHASH's built-in
+ * reflection, and thus bitreflect() is actually a byteswap.
+ */
+ a = get_unaligned_be64(key);
+ b = get_unaligned_be64(key + 8);
+ ctx->shash.a = cpu_to_le64((a << 1) | (b >> 63));
+ ctx->shash.b = cpu_to_le64((b << 1) | (a >> 63));
if (a >> 63)
- ctx->shash.b ^= ((u64)0xc2) << 56;
-
+ ctx->shash.a ^= cpu_to_le64((u64)0xc2 << 56);
return 0;
}
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
index 6a0b15e7196a..ef73a3ab8726 100644
--- a/arch/x86/crypto/nh-avx2-x86_64.S
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define PASS0_SUMS %ymm0
#define PASS1_SUMS %ymm1
@@ -65,11 +66,11 @@
/*
* void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- * u8 hash[NH_HASH_BYTES])
+ * __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
-SYM_FUNC_START(nh_avx2)
+SYM_TYPED_FUNC_START(nh_avx2)
vmovdqu 0x00(KEY), K0
vmovdqu 0x10(KEY), K1
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
index 34c567bbcb4f..75fb994b6d17 100644
--- a/arch/x86/crypto/nh-sse2-x86_64.S
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -8,6 +8,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define PASS0_SUMS %xmm0
#define PASS1_SUMS %xmm1
@@ -67,11 +68,11 @@
/*
* void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- * u8 hash[NH_HASH_BYTES])
+ * __le64 hash[NH_NUM_PASSES])
*
* It's guaranteed that message_len % 16 == 0.
*/
-SYM_FUNC_START(nh_sse2)
+SYM_TYPED_FUNC_START(nh_sse2)
movdqu 0x00(KEY), K0
movdqu 0x10(KEY), K1
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
index 8ea5ab0f1ca7..46b036204ed9 100644
--- a/arch/x86/crypto/nhpoly1305-avx2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -14,14 +14,7 @@
#include <asm/simd.h>
asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- u8 hash[NH_HASH_BYTES]);
-
-/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
-static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
- __le64 hash[NH_NUM_PASSES])
-{
- nh_avx2(key, message, message_len, (u8 *)hash);
-}
+ __le64 hash[NH_NUM_PASSES]);
static int nhpoly1305_avx2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
@@ -33,7 +26,7 @@ static int nhpoly1305_avx2_update(struct shash_desc *desc,
unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_avx2);
kernel_fpu_end();
src += n;
srclen -= n;
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
index 2b353d42ed13..4a4970d75107 100644
--- a/arch/x86/crypto/nhpoly1305-sse2-glue.c
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -14,14 +14,7 @@
#include <asm/simd.h>
asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- u8 hash[NH_HASH_BYTES]);
-
-/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
-static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
- __le64 hash[NH_NUM_PASSES])
-{
- nh_sse2(key, message, message_len, (u8 *)hash);
-}
+ __le64 hash[NH_NUM_PASSES]);
static int nhpoly1305_sse2_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
@@ -33,7 +26,7 @@ static int nhpoly1305_sse2_update(struct shash_desc *desc,
unsigned int n = min_t(unsigned int, srclen, SZ_4K);
kernel_fpu_begin();
- crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+ crypto_nhpoly1305_update_helper(desc, src, n, nh_sse2);
kernel_fpu_end();
src += n;
srclen -= n;
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 2077ce7a5647..b9abcd79c1f4 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -108,7 +108,6 @@ if (!$kernel) {
sub declare_function() {
my ($name, $align, $nargs) = @_;
if($kernel) {
- $code .= ".align $align\n";
$code .= "SYM_FUNC_START($name)\n";
$code .= ".L$name:\n";
} else {
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 82f2313f512b..97e283621851 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -550,7 +550,6 @@
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
/* input:
* %rdi: ctx, CTX
@@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index 8ea34c9b9316..6d60c50593a9 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -550,7 +550,6 @@
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
/* input:
* %rdi: ctx, CTX
@@ -604,7 +603,6 @@ SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
RET;
SYM_FUNC_END(__serpent_enc_blk16)
-.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index a96b2fd26dab..4b49bdc95265 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -485,18 +485,18 @@
xchg WK_BUF, PRECALC_BUF
.align 32
-_loop:
+.L_loop:
/*
* code loops through more than one block
* we use K_BASE value as a signal of a last block,
* it is set below by: cmovae BUFFER_PTR, K_BASE
*/
test BLOCKS_CTR, BLOCKS_CTR
- jnz _begin
+ jnz .L_begin
.align 32
- jmp _end
+ jmp .L_end
.align 32
-_begin:
+.L_begin:
/*
* Do first block
@@ -508,9 +508,6 @@ _begin:
.set j, j+2
.endr
- jmp _loop0
-_loop0:
-
/*
* rounds:
* 10,12,14,16,18
@@ -545,7 +542,7 @@ _loop0:
UPDATE_HASH 16(HASH_PTR), E
test BLOCKS_CTR, BLOCKS_CTR
- jz _loop
+ jz .L_loop
mov TB, B
@@ -562,8 +559,6 @@ _loop0:
.set j, j+2
.endr
- jmp _loop1
-_loop1:
/*
* rounds
* 20+80,22+80,24+80,26+80,28+80
@@ -574,9 +569,6 @@ _loop1:
.set j, j+2
.endr
- jmp _loop2
-_loop2:
-
/*
* rounds
* 40+80,42+80,44+80,46+80,48+80
@@ -592,9 +584,6 @@ _loop2:
/* Move to the next block only if needed*/
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
- jmp _loop3
-_loop3:
-
/*
* rounds
* 60+80,62+80,64+80,66+80,68+80
@@ -623,10 +612,10 @@ _loop3:
xchg WK_BUF, PRECALC_BUF
- jmp _loop
+ jmp .L_loop
.align 32
- _end:
+.L_end:
.endm
/*
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 2f94ec0e763b..cade913d4882 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -54,6 +54,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
@@ -92,8 +93,7 @@
* numBlocks: Number of blocks to process
*/
.text
-.align 32
-SYM_FUNC_START(sha1_ni_transform)
+SYM_TYPED_FUNC_START(sha1_ni_transform)
push %rbp
mov %rsp, %rbp
sub $FRAME_SIZE, %rsp
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 263f916362e0..f54988c80eb4 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -25,6 +25,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define CTX %rdi // arg1
#define BUF %rsi // arg2
@@ -67,7 +68,7 @@
* param: function's name
*/
.macro SHA1_VECTOR_ASM name
- SYM_FUNC_START(\name)
+ SYM_TYPED_FUNC_START(\name)
push %rbx
push %r12
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 3baa1ec39097..53de72bdd851 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -346,8 +347,7 @@ a = TMP_
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_avx)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_avx)
pushq %rbx
pushq %r12
pushq %r13
@@ -360,7 +360,7 @@ SYM_FUNC_START(sha256_transform_avx)
and $~15, %rsp # align stack pointer
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, _INP_END(%rsp)
@@ -377,7 +377,7 @@ SYM_FUNC_START(sha256_transform_avx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
vmovdqa _SHUF_00BA(%rip), SHUF_00BA
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -391,7 +391,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -410,10 +410,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
vpaddd (TBL), X0, XFER
vmovdqa XFER, _XFER(%rsp)
DO_ROUND 0
@@ -433,7 +433,7 @@ loop2:
vmovdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -447,9 +447,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9bcdbc47b8b4..9918212faf91 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -49,6 +49,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -523,8 +524,7 @@ STACK_SIZE = _CTX + _CTX_SIZE
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_rorx)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_rorx)
pushq %rbx
pushq %r12
pushq %r13
@@ -538,12 +538,12 @@ SYM_FUNC_START(sha256_transform_rorx)
and $-32, %rsp # align rsp to 32 byte boundary
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
mov NUM_BLKS, _INP_END(%rsp)
cmp NUM_BLKS, INP
- je only_one_block
+ je .Lonly_one_block
## load initial digest
mov (CTX), a
@@ -561,7 +561,7 @@ SYM_FUNC_START(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
-loop0:
+.Lloop0:
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -580,7 +580,7 @@ loop0:
vperm2i128 $0x20, XTMP3, XTMP1, X2
vperm2i128 $0x31, XTMP3, XTMP1, X3
-last_block_enter:
+.Llast_block_enter:
add $64, INP
mov INP, _INP(%rsp)
@@ -588,34 +588,40 @@ last_block_enter:
xor SRND, SRND
.align 16
-loop1:
- vpaddd K256+0*32(SRND), X0, XFER
+.Lloop1:
+ leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
- vpaddd K256+1*32(SRND), X0, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
- vpaddd K256+2*32(SRND), X0, XFER
+ leaq K256+2*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
- vpaddd K256+3*32(SRND), X0, XFER
+ leaq K256+3*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
add $4*32, SRND
cmp $3*4*32, SRND
- jb loop1
+ jb .Lloop1
-loop2:
+.Lloop2:
## Do last 16 rounds with no scheduling
- vpaddd K256+0*32(SRND), X0, XFER
+ leaq K256+0*32(%rip), INP
+ vpaddd (INP, SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
- vpaddd K256+1*32(SRND), X1, XFER
+ leaq K256+1*32(%rip), INP
+ vpaddd (INP, SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@@ -624,7 +630,7 @@ loop2:
vmovdqa X3, X1
cmp $4*4*32, SRND
- jb loop2
+ jb .Lloop2
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -639,17 +645,17 @@ loop2:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- ja done_hash
+ ja .Ldone_hash
#### Do second block using previously scheduled results
xor SRND, SRND
.align 16
-loop3:
+.Lloop3:
DO_4ROUNDS _XFER + 0*32 + 16
DO_4ROUNDS _XFER + 1*32 + 16
add $2*32, SRND
cmp $4*4*32, SRND
- jb loop3
+ jb .Lloop3
mov _CTX(%rsp), CTX
mov _INP(%rsp), INP
@@ -665,10 +671,10 @@ loop3:
addm (4*7)(CTX),h
cmp _INP_END(%rsp), INP
- jb loop0
- ja done_hash
+ jb .Lloop0
+ ja .Ldone_hash
-do_last_block:
+.Ldo_last_block:
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -679,9 +685,9 @@ do_last_block:
vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
- jmp last_block_enter
+ jmp .Llast_block_enter
-only_one_block:
+.Lonly_one_block:
## load initial digest
mov (4*0)(CTX),a
@@ -698,9 +704,9 @@ only_one_block:
vmovdqa _SHUF_DC00(%rip), SHUF_DC00
mov CTX, _CTX(%rsp)
- jmp do_last_block
+ jmp .Ldo_last_block
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
pop %rbp
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index c4a5db612c32..93264ee44543 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -47,6 +47,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
## assume buffers not aligned
#define MOVDQ movdqu
@@ -355,8 +356,7 @@ a = TMP_
## arg 3 : Num blocks
########################################################################
.text
-SYM_FUNC_START(sha256_transform_ssse3)
-.align 32
+SYM_TYPED_FUNC_START(sha256_transform_ssse3)
pushq %rbx
pushq %r12
pushq %r13
@@ -369,7 +369,7 @@ SYM_FUNC_START(sha256_transform_ssse3)
and $~15, %rsp
shl $6, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS
mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
@@ -387,7 +387,7 @@ SYM_FUNC_START(sha256_transform_ssse3)
movdqa _SHUF_00BA(%rip), SHUF_00BA
movdqa _SHUF_DC00(%rip), SHUF_DC00
-loop0:
+.Lloop0:
lea K256(%rip), TBL
## byte swap first 16 dwords
@@ -401,7 +401,7 @@ loop0:
## schedule 48 input dwords, by doing 3 rounds of 16 each
mov $3, SRND
.align 16
-loop1:
+.Lloop1:
movdqa (TBL), XFER
paddd X0, XFER
movdqa XFER, _XFER(%rsp)
@@ -424,10 +424,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
sub $1, SRND
- jne loop1
+ jne .Lloop1
mov $2, SRND
-loop2:
+.Lloop2:
paddd (TBL), X0
movdqa X0, _XFER(%rsp)
DO_ROUND 0
@@ -446,7 +446,7 @@ loop2:
movdqa X3, X1
sub $1, SRND
- jne loop2
+ jne .Lloop2
addm (4*0)(CTX),a
addm (4*1)(CTX),b
@@ -460,9 +460,9 @@ loop2:
mov _INP(%rsp), INP
add $64, INP
cmp _INP_END(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
mov %rbp, %rsp
popq %rbp
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
index 94d50dd27cb5..537b6dcd7ed8 100644
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ b/arch/x86/crypto/sha256_ni_asm.S
@@ -54,6 +54,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#define DIGEST_PTR %rdi /* 1st arg */
#define DATA_PTR %rsi /* 2nd arg */
@@ -96,8 +97,7 @@
*/
.text
-.align 32
-SYM_FUNC_START(sha256_ni_transform)
+SYM_TYPED_FUNC_START(sha256_ni_transform)
shl $6, NUM_BLKS /* convert to bytes */
jz .Ldone_hash
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 1fefe6dd3a9e..d902b8ea0721 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -273,9 +274,9 @@ frame_size = frame_WK + WK_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
-SYM_FUNC_START(sha512_transform_avx)
+SYM_TYPED_FUNC_START(sha512_transform_avx)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -290,7 +291,7 @@ SYM_FUNC_START(sha512_transform_avx)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -347,7 +348,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -360,7 +361,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_avx)
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 5cdaab7d6901..f08496cd6870 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -50,6 +50,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -565,7 +566,7 @@ frame_size = frame_CTX + CTX_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
-SYM_FUNC_START(sha512_transform_rorx)
+SYM_TYPED_FUNC_START(sha512_transform_rorx)
# Save GPRs
push %rbx
push %r12
@@ -580,7 +581,7 @@ SYM_FUNC_START(sha512_transform_rorx)
and $~(0x20 - 1), %rsp
shl $7, NUM_BLKS # convert to bytes
- jz done_hash
+ jz .Ldone_hash
add INP, NUM_BLKS # pointer to end of data
mov NUM_BLKS, frame_INPEND(%rsp)
@@ -599,7 +600,7 @@ SYM_FUNC_START(sha512_transform_rorx)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
-loop0:
+.Lloop0:
lea K512(%rip), TBL
## byte swap first 16 dwords
@@ -614,7 +615,7 @@ loop0:
movq $4, frame_SRND(%rsp)
.align 16
-loop1:
+.Lloop1:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
FOUR_ROUNDS_AND_SCHED
@@ -633,10 +634,10 @@ loop1:
FOUR_ROUNDS_AND_SCHED
subq $1, frame_SRND(%rsp)
- jne loop1
+ jne .Lloop1
movq $2, frame_SRND(%rsp)
-loop2:
+.Lloop2:
vpaddq (TBL), Y_0, XFER
vmovdqa XFER, frame_XFER(%rsp)
DO_4ROUNDS
@@ -649,7 +650,7 @@ loop2:
vmovdqa Y_3, Y_1
subq $1, frame_SRND(%rsp)
- jne loop2
+ jne .Lloop2
mov frame_CTX(%rsp), CTX2
addm 8*0(CTX2), a
@@ -664,9 +665,9 @@ loop2:
mov frame_INP(%rsp), INP
add $128, INP
cmp frame_INPEND(%rsp), INP
- jne loop0
+ jne .Lloop0
-done_hash:
+.Ldone_hash:
# Restore Stack Pointer
mov %rbp, %rsp
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index b84c22e06c5f..65be30156816 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -48,6 +48,7 @@
########################################################################
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.text
@@ -274,10 +275,10 @@ frame_size = frame_WK + WK_SIZE
# of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks.
########################################################################
-SYM_FUNC_START(sha512_transform_ssse3)
+SYM_TYPED_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
- je nowork
+ je .Lnowork
# Save GPRs
push %rbx
@@ -292,7 +293,7 @@ SYM_FUNC_START(sha512_transform_ssse3)
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
-updateblock:
+.Lupdateblock:
# Load state variables
mov DIGEST(0), a_64
@@ -349,7 +350,7 @@ updateblock:
# Advance to next message block
add $16*8, msg
dec msglen
- jnz updateblock
+ jnz .Lupdateblock
# Restore Stack Pointer
mov %rbp, %rsp
@@ -362,7 +363,7 @@ updateblock:
pop %r12
pop %rbx
-nowork:
+.Lnowork:
RET
SYM_FUNC_END(sha512_transform_ssse3)
diff --git a/arch/x86/crypto/sm3-avx-asm_64.S b/arch/x86/crypto/sm3-avx-asm_64.S
index b12b9efb5ec5..503bab450a91 100644
--- a/arch/x86/crypto/sm3-avx-asm_64.S
+++ b/arch/x86/crypto/sm3-avx-asm_64.S
@@ -12,6 +12,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
/* Context structure */
@@ -327,8 +328,7 @@
* void sm3_transform_avx(struct sm3_state *state,
* const u8 *data, int nblocks);
*/
-.align 16
-SYM_FUNC_START(sm3_transform_avx)
+SYM_TYPED_FUNC_START(sm3_transform_avx)
/* input:
* %rdi: ctx, CTX
* %rsi: data (64*nblks bytes)
diff --git a/arch/x86/crypto/sm4-aesni-avx-asm_64.S b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
index 4767ab61ff48..e2668d2fe6ce 100644
--- a/arch/x86/crypto/sm4-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx-asm_64.S
@@ -14,6 +14,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define rRIP (%rip)
@@ -139,13 +140,11 @@
.text
-.align 16
/*
* void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
* const u8 *src, int nblocks)
*/
-.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt4)
/* input:
* %rdi: round key array, CTX
@@ -249,7 +248,6 @@ SYM_FUNC_START(sm4_aesni_avx_crypt4)
RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
-.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
/* input:
* %rdi: round key array, CTX
@@ -363,7 +361,6 @@ SYM_FUNC_END(__sm4_crypt_blk8)
* void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
* const u8 *src, int nblocks)
*/
-.align 8
SYM_FUNC_START(sm4_aesni_avx_crypt8)
/* input:
* %rdi: round key array, CTX
@@ -419,8 +416,7 @@ SYM_FUNC_END(sm4_aesni_avx_crypt8)
* void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
@@ -494,8 +490,7 @@ SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
* void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
@@ -544,8 +539,7 @@ SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
* void sm4_aesni_avx_cfb_dec_blk8(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
+SYM_TYPED_FUNC_START(sm4_aesni_avx_cfb_dec_blk8)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (8 blocks)
diff --git a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
index 4732fe8bb65b..98ede9459287 100644
--- a/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/sm4-aesni-avx2-asm_64.S
@@ -14,6 +14,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/frame.h>
#define rRIP (%rip)
@@ -153,9 +154,6 @@
.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
.text
-.align 16
-
-.align 8
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
/* input:
* %rdi: round key array, CTX
@@ -281,8 +279,7 @@ SYM_FUNC_END(__sm4_crypt_blk16)
* void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
@@ -394,8 +391,7 @@ SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)
* void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
@@ -448,8 +444,7 @@ SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
* void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
* const u8 *src, u8 *iv)
*/
-.align 8
-SYM_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
+SYM_TYPED_FUNC_START(sm4_aesni_avx2_cfb_dec_blk16)
/* input:
* %rdi: round key array, CTX
* %rsi: dst (16 blocks)
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 31f9b2ec3857..12fde271cd3f 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -228,7 +228,6 @@
vpxor x2, wkey, x2; \
vpxor x3, wkey, x3;
-.align 8
SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
/* input:
* %rdi: ctx, CTX
@@ -270,7 +269,6 @@ SYM_FUNC_START_LOCAL(__twofish_enc_blk8)
RET;
SYM_FUNC_END(__twofish_enc_blk8)
-.align 8
SYM_FUNC_START_LOCAL(__twofish_dec_blk8)
/* input:
* %rdi: ctx, CTX
diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c
index f9c4adc27404..0614beece279 100644
--- a/arch/x86/crypto/twofish_glue.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -38,8 +38,8 @@
* Third Edition.
*/
+#include <crypto/algapi.h>
#include <crypto/twofish.h>
-#include <linux/crypto.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/types.h>
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e309e7156038..91397f58ac30 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1181,7 +1181,7 @@ SYM_CODE_START(asm_exc_nmi)
* is using the thread stack right now, so it's safe for us to use it.
*/
movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
+ movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esp
call exc_nmi
movl %ebx, %esp
@@ -1243,7 +1243,7 @@ SYM_CODE_START(rewind_stack_and_make_dead)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esi
+ movl PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %esi
leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp
call make_task_dead
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9953d966d124..f31e286c2977 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -8,7 +8,7 @@
*
* entry.S contains the system-call and fault low-level handling routines.
*
- * Some of this is documented in Documentation/x86/entry_64.rst
+ * Some of this is documented in Documentation/arch/x86/entry_64.rst
*
* A note on terminology:
* - iret frame: Architecture defined interrupt frame from SS to RIP
@@ -92,7 +92,7 @@ SYM_CODE_START(entry_SYSCALL_64)
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
@@ -205,7 +205,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -252,7 +252,7 @@ SYM_FUNC_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx
- movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
+ movq %rbx, PER_CPU_VAR(fixed_percpu_data) + FIXED_stack_canary
#endif
/*
@@ -284,9 +284,11 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
- UNWIND_HINT_EMPTY
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(ret_from_fork)
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
+ CALL_DEPTH_ACCOUNT
movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
@@ -301,7 +303,7 @@ SYM_CODE_START(ret_from_fork)
1:
/* kernel thread */
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
movq %r12, %rdi
CALL_NOSPEC rbx
/*
@@ -326,11 +328,12 @@ SYM_CODE_END(ret_from_fork)
#endif
.endm
-SYM_CODE_START_LOCAL(xen_error_entry)
+SYM_CODE_START(xen_error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(xen_error_entry)
@@ -382,7 +385,14 @@ SYM_CODE_END(xen_error_entry)
*/
.macro idtentry vector asmsym cfunc has_error_code:req
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+
+ .if \vector == X86_TRAP_BP
+ /* #BP advances %rip to the next instruction */
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
+ .else
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
+ .endif
+
ENDBR
ASM_CLAC
cld
@@ -451,7 +461,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -508,7 +518,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -572,7 +582,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS offset=8
+ UNWIND_HINT_IRET_ENTRY offset=8
ENDBR
ASM_CLAC
cld
@@ -600,13 +610,13 @@ SYM_CODE_END(\asmsym)
* shared between 32 and 64 bit and emit the __irqentry_text_* markers
* so the stacktrace boundary checks work.
*/
- .align 16
+ __ALIGN
.globl __irqentry_text_start
__irqentry_text_start:
#include <asm/idtentry.h>
- .align 16
+ __ALIGN
.globl __irqentry_text_end
__irqentry_text_end:
ANNOTATE_NOENDBR
@@ -633,7 +643,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -779,7 +789,7 @@ _ASM_NOKPROBE(common_interrupt_return)
/*
* Reload gs selector with exception handling
- * edi: new selector
+ * di: new selector
*
* Is in entry.text as it shouldn't be instrumented.
*/
@@ -828,7 +838,8 @@ EXPORT_SYMBOL(asm_load_gs_index)
*
* C calling convention: exc_xen_hypervisor_callback(struct *pt_regs)
*/
-SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
+ __FUNC_ALIGN
+SYM_CODE_START_LOCAL_NOALIGN(exc_xen_hypervisor_callback)
/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
@@ -856,8 +867,9 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
* We distinguish between categories by comparing each saved segment register
* with its current contents: any discrepancy means we in category 1.
*/
-SYM_CODE_START(xen_failsafe_callback)
- UNWIND_HINT_EMPTY
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(xen_failsafe_callback)
+ UNWIND_HINT_UNDEFINED
ENDBR
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
@@ -903,7 +915,8 @@ SYM_CODE_END(xen_failsafe_callback)
* R14 - old CR3
* R15 - old SPEC_CTRL
*/
-SYM_CODE_START_LOCAL(paranoid_entry)
+SYM_CODE_START(paranoid_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
@@ -972,7 +985,7 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* CR3 above, keep the old value in a callee saved register.
*/
IBRS_ENTER save_reg=%r15
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
RET
SYM_CODE_END(paranoid_entry)
@@ -1014,7 +1027,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
- * exceptions go through error_exit instead.
+ * exceptions go through error_return instead.
*/
RESTORE_CR3 scratch_reg=%rax save_reg=%r14
@@ -1038,7 +1051,8 @@ SYM_CODE_END(paranoid_exit)
/*
* Switch GS and CR3 if needed.
*/
-SYM_CODE_START_LOCAL(error_entry)
+SYM_CODE_START(error_entry)
+ ANNOTATE_NOENDBR
UNWIND_HINT_FUNC
PUSH_AND_CLEAR_REGS save_ret=1
@@ -1056,14 +1070,11 @@ SYM_CODE_START_LOCAL(error_entry)
/* We have user CR3. Change to kernel CR3. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
-.Lerror_entry_from_usermode_after_swapgs:
-
/* Put us onto the real thread stack. */
- call sync_regs
- RET
+ jmp sync_regs
/*
* There are two places in the kernel that can potentially fault with
@@ -1094,8 +1105,9 @@ SYM_CODE_START_LOCAL(error_entry)
*/
.Lerror_entry_done_lfence:
FENCE_SWAPGS_KERNEL_ENTRY
+ CALL_DEPTH_ACCOUNT
leaq 8(%rsp), %rax /* return pt_regs pointer */
- ANNOTATE_UNRET_END
+ VALIDATE_UNRET_END
RET
.Lbstep_iret:
@@ -1112,7 +1124,7 @@ SYM_CODE_START_LOCAL(error_entry)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
IBRS_ENTER
- UNTRAIN_RET
+ UNTRAIN_RET_FROM_CALL
/*
* Pretend that the exception came from user mode: set up pt_regs
@@ -1121,7 +1133,7 @@ SYM_CODE_START_LOCAL(error_entry)
leaq 8(%rsp), %rdi /* arg0 = pt_regs pointer */
call fixup_bad_iret
mov %rax, %rdi
- jmp .Lerror_entry_from_usermode_after_swapgs
+ jmp sync_regs
SYM_CODE_END(error_entry)
SYM_CODE_START_LOCAL(error_return)
@@ -1141,7 +1153,7 @@ SYM_CODE_END(error_return)
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
/*
@@ -1206,7 +1218,7 @@ SYM_CODE_START(asm_exc_nmi)
FENCE_SWAPGS_USER_ENTRY
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
movq %rsp, %rdx
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
UNWIND_HINT_IRET_REGS base=%rdx offset=8
pushq 5*8(%rdx) /* pt_regs->ss */
pushq 4*8(%rdx) /* pt_regs->rsp */
@@ -1508,7 +1520,7 @@ SYM_CODE_END(asm_exc_nmi)
* MSRs to fully disable 32-bit SYSCALL.
*/
SYM_CODE_START(ignore_sysret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
sysretl
@@ -1516,12 +1528,13 @@ SYM_CODE_END(ignore_sysret)
#endif
.pushsection .text, "ax"
-SYM_CODE_START(rewind_stack_and_make_dead)
+ __FUNC_ALIGN
+SYM_CODE_START_NOALIGN(rewind_stack_and_make_dead)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rax
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rax
leaq -PTREGS_SIZE(%rax), %rsp
UNWIND_HINT_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 4dd19819053a..70150298f8bd 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -58,10 +58,10 @@ SYM_CODE_START(entry_SYSENTER_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
popq %rax
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq $__USER32_DS /* pt_regs->ss */
+ pushq $__USER_DS /* pt_regs->ss */
pushq $0 /* pt_regs->sp = 0 (placeholder) */
/*
@@ -128,7 +128,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
- ANNOTATE_NOENDBR // is_sysenter_singlestep
SYM_CODE_END(entry_SYSENTER_compat)
/*
@@ -191,13 +190,13 @@ SYM_CODE_START(entry_SYSCALL_compat)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the kernel stack */
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_compat_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
- pushq $__USER32_DS /* pt_regs->ss */
+ pushq $__USER_DS /* pt_regs->ss */
pushq %r8 /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER32_CS /* pt_regs->cs */
@@ -332,7 +331,7 @@ SYM_CODE_START(entry_INT80_compat)
ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
movq %rsp, %rax
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
pushq 5*8(%rax) /* regs->ss */
pushq 4*8(%rax) /* regs->rsp */
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index f38b07d2768b..5e37f41e5f14 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -11,7 +11,7 @@
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func
-SYM_FUNC_START_NOALIGN(\name)
+SYM_FUNC_START(\name)
pushq %rbp
movq %rsp, %rbp
@@ -36,7 +36,7 @@ SYM_FUNC_END(\name)
EXPORT_SYMBOL(preempt_schedule_thunk)
EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
-SYM_CODE_START_LOCAL_NOALIGN(__thunk_restore)
+SYM_CODE_START_LOCAL(__thunk_restore)
popq %r11
popq %r10
popq %r9
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 3e88b9df8c8f..6a1821bd7d5e 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,10 +3,7 @@
# Building vDSO images for x86.
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
-ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Sanitizer runtimes are unavailable and cannot be linked here.
@@ -29,15 +26,16 @@ VDSO32-$(CONFIG_IA32_EMULATION) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
vobjs32-y := vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
-vobjs32-y += vdso32/vclock_gettime.o
+vobjs32-y += vdso32/vclock_gettime.o vdso32/vgetcpu.o
vobjs-$(CONFIG_X86_SGX) += vsgx.o
# files to link into kernel
-obj-y += vma.o extable.o
-KASAN_SANITIZE_vma.o := y
-UBSAN_SANITIZE_vma.o := y
-KCSAN_SANITIZE_vma.o := y
-OBJECT_FILES_NON_STANDARD_vma.o := n
+obj-y += vma.o extable.o
+KASAN_SANITIZE_vma.o := y
+UBSAN_SANITIZE_vma.o := y
+KCSAN_SANITIZE_vma.o := y
+OBJECT_FILES_NON_STANDARD_vma.o := n
+OBJECT_FILES_NON_STANDARD_extable.o := n
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
@@ -94,7 +92,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(PADDING_CFLAGS) $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -103,6 +101,7 @@ $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
CFLAGS_REMOVE_vclock_gettime.o = -pg
CFLAGS_REMOVE_vdso32/vclock_gettime.o = -pg
CFLAGS_REMOVE_vgetcpu.o = -pg
+CFLAGS_REMOVE_vdso32/vgetcpu.o = -pg
CFLAGS_REMOVE_vsgx.o = -pg
#
@@ -157,6 +156,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(PADDING_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 4bf48462fca7..e8c60ae7a7c8 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -27,7 +27,9 @@ VERSION {
__vdso_time;
clock_getres;
__vdso_clock_getres;
+#ifdef CONFIG_X86_SGX
__vdso_sgx_enter_enclave;
+#endif
local: *;
};
}
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 5264daa8859f..67b3e37576a6 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -179,6 +179,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
+ fprintf(outfile, "#include <linux/init.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
fprintf(outfile, "#include <asm/vdso.h>\n");
fprintf(outfile, "\n");
@@ -218,5 +219,10 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
required_syms[i].name, (int64_t)syms[i]);
}
+ fprintf(outfile, "};\n\n");
+ fprintf(outfile, "static __init int init_%s(void) {\n", image_name);
+ fprintf(outfile, "\treturn init_vdso_image(&%s);\n", image_name);
fprintf(outfile, "};\n");
+ fprintf(outfile, "subsys_initcall(init_%s);\n", image_name);
+
}
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 43842fade8fa..f3b3cacbcbb0 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -51,17 +51,8 @@ __setup("vdso32=", vdso32_setup);
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-int __init sysenter_setup(void)
-{
- init_vdso_image(&vdso_image_32);
-
- return 0;
-}
-
#ifdef CONFIG_X86_64
-subsys_initcall(sysenter_setup);
-
#ifdef CONFIG_SYSCTL
/* Register vsyscall32 into the ABI table */
#include <linux/sysctl.h>
@@ -79,18 +70,9 @@ static struct ctl_table abi_table2[] = {
{}
};
-static struct ctl_table abi_root_table2[] = {
- {
- .procname = "abi",
- .mode = 0555,
- .child = abi_table2
- },
- {}
-};
-
static __init int ia32_binfmt_init(void)
{
- register_sysctl_table(abi_root_table2);
+ register_sysctl("abi", abi_table2);
return 0;
}
__initcall(ia32_binfmt_init);
diff --git a/arch/x86/entry/vdso/vdso32/fake_32bit_build.h b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
new file mode 100644
index 000000000000..db1b15f686e3
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/fake_32bit_build.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_X86_64
+
+/*
+ * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
+ * configuration
+ */
+#undef CONFIG_64BIT
+#undef CONFIG_X86_64
+#undef CONFIG_COMPAT
+#undef CONFIG_PGTABLE_LEVELS
+#undef CONFIG_ILLEGAL_POINTER_VALUE
+#undef CONFIG_SPARSEMEM_VMEMMAP
+#undef CONFIG_NR_CPUS
+#undef CONFIG_PARAVIRT_XXL
+
+#define CONFIG_X86_32 1
+#define CONFIG_PGTABLE_LEVELS 2
+#define CONFIG_PAGE_OFFSET 0
+#define CONFIG_ILLEGAL_POINTER_VALUE 0
+#define CONFIG_NR_CPUS 1
+
+#define BUILD_VDSO32_64
+
+#endif
diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 283ed9d00426..86981decfea8 100644
--- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
@@ -1,29 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
#define BUILD_VDSO32
-
-#ifdef CONFIG_X86_64
-
-/*
- * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel
- * configuration
- */
-#undef CONFIG_64BIT
-#undef CONFIG_X86_64
-#undef CONFIG_COMPAT
-#undef CONFIG_PGTABLE_LEVELS
-#undef CONFIG_ILLEGAL_POINTER_VALUE
-#undef CONFIG_SPARSEMEM_VMEMMAP
-#undef CONFIG_NR_CPUS
-#undef CONFIG_PARAVIRT_XXL
-
-#define CONFIG_X86_32 1
-#define CONFIG_PGTABLE_LEVELS 2
-#define CONFIG_PAGE_OFFSET 0
-#define CONFIG_ILLEGAL_POINTER_VALUE 0
-#define CONFIG_NR_CPUS 1
-
-#define BUILD_VDSO32_64
-
-#endif
-
+#include "fake_32bit_build.h"
#include "../vclock_gettime.c"
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index c7720995ab1a..8a3be07006bb 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -28,6 +28,7 @@ VERSION
__vdso_time;
__vdso_clock_getres;
__vdso_clock_gettime64;
+ __vdso_getcpu;
};
LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdso32/vgetcpu.c b/arch/x86/entry/vdso/vdso32/vgetcpu.c
new file mode 100644
index 000000000000..3a9791f5e998
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/vgetcpu.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "fake_32bit_build.h"
+#include "../vgetcpu.c"
diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index b88a82bbc359..0a9007c24056 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -7,8 +7,7 @@
#include <linux/kernel.h>
#include <linux/getcpu.h>
-#include <linux/time.h>
-#include <asm/vgtod.h>
+#include <asm/segment.h>
notrace long
__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 311eae30e089..11a5c68d1218 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -44,13 +44,16 @@ unsigned int vclocks_used __read_mostly;
unsigned int __read_mostly vdso64_enabled = 1;
#endif
-void __init init_vdso_image(const struct vdso_image *image)
+int __init init_vdso_image(const struct vdso_image *image)
{
+ BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
BUG_ON(image->size % PAGE_SIZE != 0);
apply_alternatives((struct alt_instr *)(image->data + image->alt),
(struct alt_instr *)(image->data + image->alt +
image->alt_len));
+
+ return 0;
}
static const struct vm_special_mapping vvar_mapping;
@@ -98,24 +101,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
}
#ifdef CONFIG_TIME_NS
-static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
/*
* The vvar page layout depends on whether a task belongs to the root or
* non-root time namespace. Whenever a task changes its namespace, the VVAR
@@ -131,20 +116,13 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
- unsigned long size = vma->vm_end - vma->vm_start;
-
if (vma_is_special_mapping(vma, &vvar_mapping))
- zap_page_range(vma, vma->vm_start, size);
+ zap_vma_pages(vma);
}
mmap_read_unlock(mm);
return 0;
}
-#else
-static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- return NULL;
-}
#endif
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
@@ -210,11 +188,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
pgprot_decrypted(vma->vm_page_prot));
}
} else if (sym_offset == image->sym_hvclock_page) {
- struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+ pfn = hv_get_tsc_pfn();
- if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
- return vmf_insert_pfn(vma, vmf->address,
- virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+ if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
+ return vmf_insert_pfn(vma, vmf->address, pfn);
} else if (sym_offset == image->sym_timens_page) {
struct page *timens_page = find_timens_vvar_page(vma);
@@ -327,7 +304,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
end -= len;
if (end > start) {
- offset = prandom_u32_max(((end - start) >> PAGE_SHIFT) + 1);
+ offset = get_random_u32_below(((end - start) >> PAGE_SHIFT) + 1);
addr = start + (offset << PAGE_SHIFT);
} else {
addr = start;
@@ -442,18 +419,4 @@ static __init int vdso_setup(char *s)
return 1;
}
__setup("vdso=", vdso_setup);
-
-static int __init init_vdso(void)
-{
- BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);
-
- init_vdso_image(&vdso_image_64);
-
-#ifdef CONFIG_X86_X32_ABI
- init_vdso_image(&vdso_image_x32);
-#endif
-
- return 0;
-}
-subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 4af81df133ee..e0ca8120aea8 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -317,7 +317,7 @@ static struct vm_area_struct gate_vma __ro_after_init = {
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
- if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
+ if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
return NULL;
#endif
if (vsyscall_mode == NONE)
@@ -391,7 +391,7 @@ void __init map_vsyscall(void)
}
if (vsyscall_mode == XONLY)
- gate_vma.vm_flags = VM_EXEC;
+ vm_flags_init(&gate_vma, VM_EXEC);
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR);
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index f1bff153d945..ed308719236c 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -41,18 +41,15 @@ static inline unsigned int brs_to(int idx)
return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
}
-static inline void set_debug_extn_cfg(u64 val)
+static __always_inline void set_debug_extn_cfg(u64 val)
{
/* bits[4:3] must always be set to 11b */
- wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3);
+ __wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32);
}
-static inline u64 get_debug_extn_cfg(void)
+static __always_inline u64 get_debug_extn_cfg(void)
{
- u64 val;
-
- rdmsrl(MSR_AMD_DBG_EXTN_CFG, val);
- return val;
+ return __rdmsr(MSR_AMD_DBG_EXTN_CFG);
}
static bool __init amd_brs_detect(void)
@@ -384,7 +381,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -405,7 +402,7 @@ void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
* called from ACPI processor_idle.c or acpi_pad.c
* with interrupts disabled
*/
-void perf_amd_brs_lopwr_cb(bool lopwr_in)
+void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
union amd_debug_extn_cfg cfg;
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 8b70237c33f7..bccea57dee81 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -861,8 +861,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
pmu_enabled = cpuc->enabled;
cpuc->enabled = 0;
- /* stop everything (includes BRS) */
- amd_pmu_disable_all();
+ amd_brs_disable_all();
/* Drain BRS is in use (could be inactive) */
if (cpuc->lbr_users)
@@ -873,7 +872,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
cpuc->enabled = pmu_enabled;
if (pmu_enabled)
- amd_pmu_enable_all(0);
+ amd_brs_enable_all();
return amd_pmu_adjust_nmi_window(handled);
}
@@ -924,20 +923,17 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
/* Event overflow */
handled++;
+ status &= ~mask;
perf_sample_data_init(&data, 0, hwc->last_period);
if (!x86_perf_event_set_period(event))
continue;
- if (has_branch_stack(event)) {
- data.br_stack = &cpuc->lbr_stack;
- data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
-
- status &= ~mask;
}
/*
@@ -1388,7 +1384,7 @@ static int __init amd_core_pmu_init(void)
* numbered counter following it.
*/
for (i = 0; i < x86_pmu.num_counters - 1; i += 2)
- even_ctr_mask |= 1 << i;
+ even_ctr_mask |= BIT_ULL(i);
pair_constraint = (struct event_constraint)
__EVENT_CONSTRAINT(0, even_ctr_mask, 0,
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 4cb710efbdd9..64582954b5f6 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -631,7 +631,7 @@ static const struct attribute_group *op_attr_update[] = {
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
@@ -655,7 +655,7 @@ static struct perf_ibs perf_ibs_fetch = {
static struct perf_ibs perf_ibs_op = {
.pmu = {
- .task_ctx_nr = perf_invalid_context,
+ .task_ctx_nr = perf_hw_context,
.event_init = perf_ibs_init,
.add = perf_ibs_add,
@@ -1110,8 +1110,7 @@ fail:
.data = ibs_data.data,
},
};
- data.raw = &raw;
- data.sample_flags |= PERF_SAMPLE_RAW;
+ perf_sample_save_raw_data(&data, &raw);
}
if (perf_ibs == &perf_ibs_op)
@@ -1122,10 +1121,8 @@ fail:
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- data.callchain = perf_callchain(event, iregs);
- data.sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(&data, event, iregs);
throttle = perf_event_overflow(event, &data, &regs);
out:
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
index 38a75216c12c..eb31f850841a 100644
--- a/arch/x86/events/amd/lbr.c
+++ b/arch/x86/events/amd/lbr.c
@@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
@@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index d568afc705d2..83f15fe411b3 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -553,6 +553,7 @@ static void uncore_clean_online(void)
hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) {
hlist_del(&uncore->node);
+ kfree(uncore->events);
kfree(uncore);
}
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b30b8bbcd1e2..9d248703cbdd 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
+DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
+
/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
@@ -1701,10 +1703,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event)) {
- data.br_stack = &cpuc->lbr_stack;
- data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -2031,6 +2031,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
+ static_call_update(x86_pmu_filter, x86_pmu.filter);
}
static void _x86_pmu_read(struct perf_event *event)
@@ -2052,23 +2053,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}
-/*
- * The generic code is not hybrid friendly. The hybrid_pmu->pmu
- * of the first registered PMU is unconditionally assigned to
- * each possible cpuctx->ctx.pmu.
- * Update the correct hybrid PMU to the cpuctx->ctx.pmu.
- */
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
-{
- struct perf_cpu_context *cpuctx;
-
- if (!pmu->pmu_cpu_context)
- return;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->ctx.pmu = pmu;
-}
-
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -2175,13 +2159,9 @@ static int __init init_hw_perf_events(void)
if (err)
goto out2;
} else {
- u8 cpu_type = get_this_hybrid_cpu_type();
struct x86_hybrid_pmu *hybrid_pmu;
int i, j;
- if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
- cpu_type = x86_pmu.get_hybrid_cpu_type();
-
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
hybrid_pmu = &x86_pmu.hybrid_pmu[i];
@@ -2195,9 +2175,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;
-
- if (cpu_type == hybrid_pmu->cpu_type)
- x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
@@ -2646,15 +2623,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
- static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
+ static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}
-static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
+ static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}
void perf_check_microcode(void)
@@ -2689,12 +2666,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
-static int x86_pmu_filter_match(struct perf_event *event)
+static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
- if (x86_pmu.filter_match)
- return x86_pmu.filter_match(event);
+ bool ret = false;
- return 1;
+ static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
+
+ return ret;
}
static struct pmu pmu = {
@@ -2725,7 +2703,7 @@ static struct pmu pmu = {
.aux_output_match = x86_pmu_aux_output_match,
- .filter_match = x86_pmu_filter_match,
+ .filter = x86_pmu_filter,
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -2994,17 +2972,19 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
- if (!x86_pmu_initialized()) {
+ /* This API doesn't currently support enumerating hybrid PMUs. */
+ if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
+ !x86_pmu_initialized()) {
memset(cap, 0, sizeof(*cap));
return;
}
- cap->version = x86_pmu.version;
/*
- * KVM doesn't support the hybrid PMU yet.
- * Return the common value in global x86_pmu,
- * which available for all cores.
+ * Note, hybrid CPU models get tracked as having hybrid PMUs even when
+ * all E-cores are disabled via BIOS. When E-cores are disabled, the
+ * base PMU holds the correct number of counters for P-cores.
*/
+ cap->version = x86_pmu.version;
cap->num_counters_gp = x86_pmu.num_counters;
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
cap->bit_width_gp = x86_pmu.cntval_bits;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1b92bf05fd65..27f3a7b34bd5 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -349,6 +349,16 @@ static struct event_constraint intel_spr_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+static struct extra_reg intel_gnr_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ INTEL_UEVENT_EXTRA_REG(0x02c6, MSR_PEBS_FRONTEND, 0x9, FE),
+ INTEL_UEVENT_EXTRA_REG(0x03c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE),
+ INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE),
+ INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE),
+ EVENT_EXTRA_END
+};
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
@@ -2119,6 +2129,16 @@ static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_cmt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff3ffffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff3ffffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ INTEL_UEVENT_EXTRA_REG(0x0127, MSR_SNOOP_RSP_0, 0xffffffffffffffffull, SNOOP_0),
+ INTEL_UEVENT_EXTRA_REG(0x0227, MSR_SNOOP_RSP_1, 0xffffffffffffffffull, SNOOP_1),
+ EVENT_EXTRA_END
+};
+
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
@@ -3026,10 +3046,8 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event)) {
- data.br_stack = &cpuc->lbr_stack;
- data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(&data, event, &cpuc->lbr_stack);
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -4066,7 +4084,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
if (x86_pmu.intel_cap.pebs_baseline) {
arr[(*nr)++] = (struct perf_guest_switch_msr){
.msr = MSR_PEBS_DATA_CFG,
- .host = cpuc->pebs_data_cfg,
+ .host = cpuc->active_pebs_data_cfg,
.guest = kvm_pmu->pebs_data_cfg,
};
}
@@ -4182,6 +4200,12 @@ static int hsw_hw_config(struct perf_event *event)
static struct event_constraint counter0_constraint =
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1);
+static struct event_constraint counter1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x2);
+
+static struct event_constraint counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x3);
+
static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
@@ -4191,6 +4215,12 @@ static struct event_constraint fixed0_constraint =
static struct event_constraint fixed0_counter0_constraint =
INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL);
+static struct event_constraint fixed0_counter0_1_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000003ULL);
+
+static struct event_constraint counters_1_7_constraint =
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xfeULL);
+
static struct event_constraint *
hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
@@ -4322,6 +4352,78 @@ adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return &emptyconstraint;
}
+static struct event_constraint *
+cmt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * The :ppp indicates the Precise Distribution (PDist) facility, which
+ * is only supported on the GP counter 0 & 1 and Fixed counter 0.
+ * If a :ppp event which is not available on the above eligible counters,
+ * error out.
+ */
+ if (event->attr.precise_ip == 3) {
+ /* Force instruction:ppp on PMC0, 1 and Fixed counter 0 */
+ if (constraint_match(&fixed0_constraint, event->hw.config))
+ return &fixed0_counter0_1_constraint;
+
+ switch (c->idxmsk64 & 0x3ull) {
+ case 0x1:
+ return &counter0_constraint;
+ case 0x2:
+ return &counter1_constraint;
+ case 0x3:
+ return &counter0_1_constraint;
+ }
+ return &emptyconstraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+rwc_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ c = spr_get_event_constraints(cpuc, idx, event);
+
+ /* The Retire Latency is not supported by the fixed counter 0. */
+ if (event->attr.precise_ip &&
+ (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE) &&
+ constraint_match(&fixed0_constraint, event->hw.config)) {
+ /*
+ * The Instruction PDIR is only available
+ * on the fixed counter 0. Error out for this case.
+ */
+ if (event->attr.precise_ip == 3)
+ return &emptyconstraint;
+ return &counters_1_7_constraint;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
+mtl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return rwc_get_event_constraints(cpuc, idx, event);
+ if (pmu->cpu_type == hybrid_small)
+ return cmt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
static int adl_hw_config(struct perf_event *event)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
@@ -4494,6 +4596,25 @@ static void flip_smm_bit(void *data)
}
}
+static void intel_pmu_check_num_counters(int *num_counters,
+ int *num_counters_fixed,
+ u64 *intel_ctrl, u64 fixed_mask);
+
+static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
+{
+ unsigned int sub_bitmaps = cpuid_eax(ARCH_PERFMON_EXT_LEAF);
+ unsigned int eax, ebx, ecx, edx;
+
+ if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
+ &eax, &ebx, &ecx, &edx);
+ pmu->num_counters = fls(eax);
+ pmu->num_counters_fixed = fls(ebx);
+ intel_pmu_check_num_counters(&pmu->num_counters, &pmu->num_counters_fixed,
+ &pmu->intel_ctrl, ebx);
+ }
+}
+
static bool init_hybrid_pmu(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -4519,6 +4640,9 @@ static bool init_hybrid_pmu(int cpu)
if (!cpumask_empty(&pmu->supported_cpus))
goto end;
+ if (this_cpu_has(X86_FEATURE_ARCH_PERFMON_EXT))
+ update_pmu_cap(pmu);
+
if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
return false;
@@ -4536,8 +4660,6 @@ end:
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;
- x86_pmu_update_cpu_context(&pmu->pmu, cpu);
-
return true;
}
@@ -4671,17 +4793,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
-static void intel_pmu_sched_task(struct perf_event_context *ctx,
+static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
- intel_pmu_pebs_sched_task(ctx, sched_in);
- intel_pmu_lbr_sched_task(ctx, sched_in);
+ intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
+ intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}
-static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
- intel_pmu_lbr_swap_task_ctx(prev, next);
+ intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}
static int intel_pmu_check_period(struct perf_event *event, u64 value)
@@ -4705,12 +4827,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
-static int intel_pmu_filter_match(struct perf_event *event)
+static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
- struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
- unsigned int cpu = smp_processor_id();
+ struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);
- return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+ *ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -5359,6 +5480,15 @@ pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
}
static umode_t
+mem_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &event_attr_mem_ld_aux.attr.attr)
+ return x86_pmu.flags & PMU_FL_MEM_LOADS_AUX ? attr->mode : 0;
+
+ return pebs_is_visible(kobj, attr, i);
+}
+
+static umode_t
lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.lbr_nr ? attr->mode : 0;
@@ -5385,7 +5515,7 @@ static struct attribute_group group_events_td = {
static struct attribute_group group_events_mem = {
.name = "events",
- .is_visible = pebs_is_visible,
+ .is_visible = mem_is_visible,
};
static struct attribute_group group_events_tsx = {
@@ -5466,6 +5596,12 @@ static struct attribute *adl_hybrid_mem_attrs[] = {
NULL,
};
+static struct attribute *mtl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ NULL
+};
+
EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
@@ -5493,20 +5629,40 @@ FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small);
FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small);
FORMAT_ATTR_HYBRID(frontend, hybrid_big);
+#define ADL_HYBRID_RTM_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(in_tx), \
+ FORMAT_HYBRID_PTR(in_tx_cp)
+
+#define ADL_HYBRID_FORMAT_ATTR \
+ FORMAT_HYBRID_PTR(offcore_rsp), \
+ FORMAT_HYBRID_PTR(ldlat), \
+ FORMAT_HYBRID_PTR(frontend)
+
static struct attribute *adl_hybrid_extra_attr_rtm[] = {
- FORMAT_HYBRID_PTR(in_tx),
- FORMAT_HYBRID_PTR(in_tx_cp),
- FORMAT_HYBRID_PTR(offcore_rsp),
- FORMAT_HYBRID_PTR(ldlat),
- FORMAT_HYBRID_PTR(frontend),
- NULL,
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
};
static struct attribute *adl_hybrid_extra_attr[] = {
- FORMAT_HYBRID_PTR(offcore_rsp),
- FORMAT_HYBRID_PTR(ldlat),
- FORMAT_HYBRID_PTR(frontend),
- NULL,
+ ADL_HYBRID_FORMAT_ATTR,
+ NULL
+};
+
+PMU_FORMAT_ATTR_SHOW(snoop_rsp, "config1:0-63");
+FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small);
+
+static struct attribute *mtl_hybrid_extra_attr_rtm[] = {
+ ADL_HYBRID_RTM_FORMAT_ATTR,
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
+};
+
+static struct attribute *mtl_hybrid_extra_attr[] = {
+ ADL_HYBRID_FORMAT_ATTR,
+ FORMAT_HYBRID_PTR(snoop_rsp),
+ NULL
};
static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
@@ -5728,6 +5884,12 @@ static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
}
}
+static __always_inline bool is_mtl(u8 x86_model)
+{
+ return (x86_model == INTEL_FAM6_METEORLAKE) ||
+ (x86_model == INTEL_FAM6_METEORLAKE_L);
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
@@ -6342,6 +6504,12 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ case INTEL_FAM6_EMERALDRAPIDS_X:
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ x86_pmu.extra_regs = intel_spr_extra_regs;
+ fallthrough;
+ case INTEL_FAM6_GRANITERAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_D:
pmem = true;
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -6349,15 +6517,16 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_spr_event_constraints;
x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
- x86_pmu.extra_regs = intel_spr_extra_regs;
+ if (!x86_pmu.extra_regs)
+ x86_pmu.extra_regs = intel_gnr_extra_regs;
x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_ept = 1;
x86_pmu.pebs_aliases = NULL;
x86_pmu.pebs_prec_dist = true;
x86_pmu.pebs_block = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
- x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = spr_get_event_constraints;
@@ -6384,6 +6553,8 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
+ case INTEL_FAM6_METEORLAKE:
+ case INTEL_FAM6_METEORLAKE_L:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
@@ -6403,9 +6574,7 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
- x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.lbr_pt_coexist = true;
- intel_pmu_pebs_data_source_adl();
x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.num_topdown_events = 8;
static_call_update(intel_pmu_update_topdown_event,
@@ -6413,7 +6582,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);
- x86_pmu.filter_match = intel_pmu_filter_match;
+ x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;
@@ -6492,8 +6661,23 @@ __init int intel_pmu_init(void)
pmu->event_constraints = intel_slm_event_constraints;
pmu->pebs_constraints = intel_grt_pebs_event_constraints;
pmu->extra_regs = intel_grt_extra_regs;
- pr_cont("Alderlake Hybrid events, ");
- name = "alderlake_hybrid";
+ if (is_mtl(boot_cpu_data.x86_model)) {
+ x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].extra_regs = intel_gnr_extra_regs;
+ x86_pmu.pebs_latency_data = mtl_latency_data_small;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr;
+ mem_attr = mtl_hybrid_mem_attrs;
+ intel_pmu_pebs_data_source_mtl();
+ x86_pmu.get_event_constraints = mtl_get_event_constraints;
+ pmu->extra_regs = intel_cmt_extra_regs;
+ pr_cont("Meteorlake Hybrid events, ");
+ name = "meteorlake_hybrid";
+ } else {
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ intel_pmu_pebs_data_source_adl();
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ }
break;
default:
@@ -6608,6 +6792,9 @@ __init int intel_pmu_init(void)
if (is_hybrid())
intel_pmu_check_hybrid_pmus((u64)fixed_mask);
+ if (x86_pmu.intel_cap.pebs_timing_info)
+ x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
+
intel_aux_output_init();
return 0;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index a2834bc93149..835862c548cc 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -41,6 +41,7 @@
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
* Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL
+ * MTL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -51,50 +52,50 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL,ADL,RPL
+ * ICL,TGL,RKL,ADL,RPL,MTL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
* KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL,
- * RPL,SPR
+ * RPL,SPR,MTL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
* GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
- * ADL,RPL
+ * ADL,RPL,MTL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX,
- * TGL,TNT,RKL,ADL,RPL,SPR
+ * TGL,TNT,RKL,ADL,RPL,SPR,MTL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL,ADL,RPL
+ * KBL,CML,ICL,TGL,RKL,ADL,RPL,MTL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL,RPL
+ * ADL,RPL,MTL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
* Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
- * ADL,RPL
+ * ADL,RPL,MTL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL,ADL,RPL
+ * TNT,RKL,ADL,RPL,MTL
* Scope: Package (physical package)
*
*/
@@ -676,6 +677,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
@@ -686,6 +690,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 446d2833efa7..df88576d6b2a 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,12 +2,14 @@
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/sched/clock.h>
#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
#include <asm/io.h>
+#include <asm/timer.h>
#include "../perf_event.h"
@@ -53,6 +55,13 @@ union intel_x86_pebs_dse {
unsigned int st_lat_locked:1;
unsigned int ld_reserved3:26;
};
+ struct {
+ unsigned int mtl_dse:5;
+ unsigned int mtl_locked:1;
+ unsigned int mtl_stlb_miss:1;
+ unsigned int mtl_fwd_blk:1;
+ unsigned int ld_reserved4:24;
+ };
};
@@ -135,6 +144,29 @@ void __init intel_pmu_pebs_data_source_adl(void)
__intel_pmu_pebs_data_source_grt(data_source);
}
+static void __init intel_pmu_pebs_data_source_cmt(u64 *data_source)
+{
+ data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
+ data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ data_source[0x0a] = OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, NONE);
+ data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ data_source[0x0c] = OP_LH | LEVEL(RAM) | REM | P(SNOOPX, FWD);
+ data_source[0x0d] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_mtl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_skl(false, data_source);
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ intel_pmu_pebs_data_source_cmt(data_source);
+}
+
static u64 precise_store_data(u64 status)
{
union intel_x86_pebs_dse dse;
@@ -219,24 +251,19 @@ static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
}
/* Retrieve the latency data for e-core of ADL */
-u64 adl_latency_data_small(struct perf_event *event, u64 status)
+static u64 __adl_latency_data_small(struct perf_event *event, u64 status,
+ u8 dse, bool tlb, bool lock, bool blk)
{
- union intel_x86_pebs_dse dse;
u64 val;
WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
- dse.val = status;
+ dse &= PERF_PEBS_DATA_SOURCE_MASK;
+ val = hybrid_var(event->pmu, pebs_data_source)[dse];
- val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
+ pebs_set_tlb_lock(&val, tlb, lock);
- /*
- * For the atom core on ADL,
- * bit 4: lock, bit 5: TLB access.
- */
- pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
-
- if (dse.ld_data_blk)
+ if (blk)
val |= P(BLK, DATA);
else
val |= P(BLK, NA);
@@ -244,6 +271,29 @@ u64 adl_latency_data_small(struct perf_event *event, u64 status)
return val;
}
+u64 adl_latency_data_small(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
+ return __adl_latency_data_small(event, status, dse.ld_dse,
+ dse.ld_locked, dse.ld_stlb_miss,
+ dse.ld_data_blk);
+}
+
+/* Retrieve the latency data for e-core of MTL */
+u64 mtl_latency_data_small(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+
+ dse.val = status;
+
+ return __adl_latency_data_small(event, status, dse.mtl_dse,
+ dse.mtl_stlb_miss, dse.mtl_locked,
+ dse.mtl_fwd_blk);
+}
+
static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
@@ -759,7 +809,8 @@ int intel_pmu_drain_bts_buffer(void)
* the sample.
*/
rcu_read_lock();
- perf_prepare_sample(&header, &data, event, &regs);
+ perf_prepare_sample(&data, event, &regs);
+ perf_prepare_header(&header, &data, event, &regs);
if (perf_output_begin(&handle, &data, event,
header.size * (top - base - skip)))
@@ -1069,7 +1120,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1177,13 +1228,15 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
- struct pmu *pmu = event->ctx->pmu;
+ struct pmu *pmu = event->pmu;
+
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
* that does not hurt:
*/
- bool update = cpuc->n_pebs == 1;
+ if (cpuc->n_pebs == 1)
+ cpuc->pebs_data_cfg = PEBS_UPDATE_DS_SW;
if (needed_cb != pebs_needs_sched_cb(cpuc)) {
if (!needed_cb)
@@ -1191,7 +1244,7 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
else
perf_sched_cb_dec(pmu);
- update = true;
+ cpuc->pebs_data_cfg |= PEBS_UPDATE_DS_SW;
}
/*
@@ -1201,24 +1254,13 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
if (x86_pmu.intel_cap.pebs_baseline && add) {
u64 pebs_data_cfg;
- /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */
- if (cpuc->n_pebs == 1) {
- cpuc->pebs_data_cfg = 0;
- cpuc->pebs_record_size = sizeof(struct pebs_basic);
- }
-
pebs_data_cfg = pebs_update_adaptive_cfg(event);
-
- /* Update pebs_record_size if new event requires more data. */
- if (pebs_data_cfg & ~cpuc->pebs_data_cfg) {
- cpuc->pebs_data_cfg |= pebs_data_cfg;
- adaptive_pebs_record_size_update();
- update = true;
- }
+ /*
+ * Be sure to update the thresholds when we change the record.
+ */
+ if (pebs_data_cfg & ~cpuc->pebs_data_cfg)
+ cpuc->pebs_data_cfg |= pebs_data_cfg | PEBS_UPDATE_DS_SW;
}
-
- if (update)
- pebs_update_threshold(cpuc);
}
void intel_pmu_pebs_add(struct perf_event *event)
@@ -1275,9 +1317,17 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event)
wrmsrl(base + idx, value);
}
+static inline void intel_pmu_drain_large_pebs(struct cpu_hw_events *cpuc)
+{
+ if (cpuc->n_pebs == cpuc->n_large_pebs &&
+ cpuc->n_pebs != cpuc->n_pebs_via_pt)
+ intel_pmu_drain_pebs_buffer();
+}
+
void intel_pmu_pebs_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 pebs_data_cfg = cpuc->pebs_data_cfg & ~PEBS_UPDATE_DS_SW;
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
unsigned int idx = hwc->idx;
@@ -1293,11 +1343,22 @@ void intel_pmu_pebs_enable(struct perf_event *event)
if (x86_pmu.intel_cap.pebs_baseline) {
hwc->config |= ICL_EVENTSEL_ADAPTIVE;
- if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) {
- wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg);
- cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg;
+ if (pebs_data_cfg != cpuc->active_pebs_data_cfg) {
+ /*
+ * drain_pebs() assumes uniform record size;
+ * hence we need to drain when changing said
+ * size.
+ */
+ intel_pmu_drain_large_pebs(cpuc);
+ adaptive_pebs_record_size_update();
+ wrmsrl(MSR_PEBS_DATA_CFG, pebs_data_cfg);
+ cpuc->active_pebs_data_cfg = pebs_data_cfg;
}
}
+ if (cpuc->pebs_data_cfg & PEBS_UPDATE_DS_SW) {
+ cpuc->pebs_data_cfg = pebs_data_cfg;
+ pebs_update_threshold(cpuc);
+ }
if (idx >= INTEL_PMC_IDX_FIXED) {
if (x86_pmu.intel_cap.pebs_format < 5)
@@ -1340,9 +1401,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- if (cpuc->n_pebs == cpuc->n_large_pebs &&
- cpuc->n_pebs != cpuc->n_pebs_via_pt)
- intel_pmu_drain_pebs_buffer();
+ intel_pmu_drain_large_pebs(cpuc);
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -1519,6 +1578,27 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
return val;
}
+static void setup_pebs_time(struct perf_event *event,
+ struct perf_sample_data *data,
+ u64 tsc)
+{
+ /* Converting to a user-defined clock is not supported yet. */
+ if (event->attr.use_clockid != 0)
+ return;
+
+ /*
+ * Doesn't support the conversion when the TSC is unstable.
+ * The TSC unstable case is a corner case and very unlikely to
+ * happen. If it happens, the TSC in a PEBS record will be
+ * dropped and fall back to perf_event_clock().
+ */
+ if (!using_native_sched_clock() || !sched_clock_stable())
+ return;
+
+ data->time = native_sched_clock_from_tsc(tsc) + __sched_clock_offset;
+ data->sample_flags |= PERF_SAMPLE_TIME;
+}
+
#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \
PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_DATA_PAGE_SIZE)
@@ -1569,10 +1649,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- data->callchain = perf_callchain(event, iregs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, iregs);
/*
* We use the interrupt regs as a base because the PEBS record does not
@@ -1668,16 +1746,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
*
* We can only do this for the default trace clock.
*/
- if (x86_pmu.intel_cap.pebs_format >= 3 &&
- event->attr.use_clockid == 0) {
- data->time = native_sched_clock_from_tsc(pebs->tsc);
- data->sample_flags |= PERF_SAMPLE_TIME;
- }
+ if (x86_pmu.intel_cap.pebs_format >= 3)
+ setup_pebs_time(event, data, pebs->tsc);
- if (has_branch_stack(event)) {
- data->br_stack = &cpuc->lbr_stack;
- data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
- }
+ if (has_branch_stack(event))
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1705,6 +1778,7 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
#define PEBS_LATENCY_MASK 0xffff
#define PEBS_CACHE_LATENCY_OFFSET 32
+#define PEBS_RETIRE_LATENCY_OFFSET 32
/*
* With adaptive PEBS the layout depends on what fields are configured.
@@ -1735,10 +1809,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
- if (event->attr.use_clockid == 0) {
- data->time = native_sched_clock_from_tsc(basic->tsc);
- data->sample_flags |= PERF_SAMPLE_TIME;
- }
+ setup_pebs_time(event, data, basic->tsc);
/*
* We must however always use iregs for the unwinder to stay sane; the
@@ -1746,16 +1817,17 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- data->callchain = perf_callchain(event, iregs);
- data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
- }
+ if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ perf_sample_save_callchain(data, event, iregs);
*regs = *iregs;
/* The ip in basic is EventingIP */
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
+ if ((sample_type & PERF_SAMPLE_WEIGHT_STRUCT) && (x86_pmu.flags & PMU_FL_RETIRE_LATENCY))
+ data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+
/*
* The record for MEMINFO is in front of GP
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
@@ -1835,8 +1907,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
- data->br_stack = &cpuc->lbr_stack;
- data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ perf_sample_save_brstack(data, event, &cpuc->lbr_stack);
}
}
@@ -2303,8 +2374,10 @@ void __init intel_ds_init(void)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
break;
- case 4:
case 5:
+ x86_pmu.pebs_ept = 1;
+ fallthrough;
+ case 4:
x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
if (x86_pmu.intel_cap.pebs_baseline) {
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8259d725054d..c3b0d15a9841 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next)
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;
- swap(prev->task_ctx_data, next->task_ctx_data);
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
/*
- * Architecture specific synchronization makes sense in
- * case both prev->task_ctx_data and next->task_ctx_data
+ * Architecture specific synchronization makes sense in case
+ * both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/
- prev_ctx_data = next->task_ctx_data;
- next_ctx_data = prev->task_ctx_data;
+ prev_ctx_data = next_epc->task_ctx_data;
+ next_ctx_data = prev_epc->task_ctx_data;
if (!prev_ctx_data || !next_ctx_data)
return;
@@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
@@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
- task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
@@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
+ if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;
/*
* Request pmu::sched_task() callback, which will fire inside the
@@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
@@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;
if (branch_user_callstack(cpuc->br_sel) &&
- event->ctx->task_ctx_data)
- task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
+ event->pmu_ctx->task_ctx_data)
+ task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;
if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
@@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
static inline bool vlbr_exclude_host(void)
@@ -1603,19 +1603,13 @@ clear_arch_lbr:
* x86_perf_get_lbr - get the LBR records information
*
* @lbr: the caller's memory to store the LBR records information
- *
- * Returns: 0 indicates the LBR info has been successfully obtained
*/
-int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
- int lbr_fmt = x86_pmu.intel_cap.lbr_format;
-
lbr->nr = x86_pmu.lbr_nr;
lbr->from = x86_pmu.lbr_from;
lbr->to = x86_pmu.lbr_to;
- lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
-
- return 0;
+ lbr->info = x86_pmu.lbr_info;
}
EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index 03bbcc2fa2ff..35936188db01 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -24,7 +24,7 @@ struct p4_event_bind {
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int escr_emask; /* valid ESCR EventMask bits */
unsigned int shared; /* event is shared across threads */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
+ signed char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
};
struct p4_pebs_bind {
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 82ef87e9a897..42a55794004a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1263,6 +1263,15 @@ static int pt_buffer_try_single(struct pt_buffer *buf, int nr_pages)
if (1 << order != nr_pages)
goto out;
+ /*
+ * Some processors cannot always support single range for more than
+ * 4KB - refer errata TGL052, ADL037 and RPL017. Future processors might
+ * also be affected, so for now rather than trying to keep track of
+ * which ones, just disable it for all.
+ */
+ if (nr_pages > 1)
+ goto out;
+
buf->single = true;
buf->nr_pages = nr_pages;
ret = 0;
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 6f1ccc57a692..bc226603ef3e 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -65,6 +65,21 @@ int uncore_die_to_segment(int die)
return bus ? pci_domain_nr(bus) : -EINVAL;
}
+int uncore_device_to_die(struct pci_dev *dev)
+{
+ int node = pcibus_to_node(dev->bus);
+ int cpu;
+
+ for_each_cpu(cpu, cpumask_of_pcibus(dev->bus)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node)
+ return c->logical_die_id;
+ }
+
+ return -1;
+}
+
static void uncore_free_pcibus_map(void)
{
struct pci2phy_map *map, *tmp;
@@ -842,6 +857,12 @@ static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
+static inline int uncore_get_box_id(struct intel_uncore_type *type,
+ struct intel_uncore_pmu *pmu)
+{
+ return type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx;
+}
+
void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
{
struct intel_uncore_type *type = pmu->type;
@@ -850,7 +871,7 @@ void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu)
sprintf(pmu_name, "uncore_type_%u", type->type_id);
else {
sprintf(pmu_name, "uncore_type_%u_%d",
- type->type_id, type->box_ids[pmu->pmu_idx]);
+ type->type_id, uncore_get_box_id(type, pmu));
}
}
@@ -877,7 +898,7 @@ static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
* Use the box ID from the discovery table if applicable.
*/
sprintf(pmu->name, "uncore_%s_%d", type->name,
- type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx);
+ uncore_get_box_id(type, pmu));
}
}
@@ -1674,7 +1695,10 @@ struct intel_uncore_init_fun {
void (*cpu_init)(void);
int (*pci_init)(void);
void (*mmio_init)(void);
+ /* Discovery table is required */
bool use_discovery;
+ /* The units in the discovery table should be ignored. */
+ int *uncore_units_ignore;
};
static const struct intel_uncore_init_fun nhm_uncore_init __initconst = {
@@ -1765,6 +1789,11 @@ static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
.mmio_init = adl_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
+ .cpu_init = mtl_uncore_cpu_init,
+ .mmio_init = adl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1782,6 +1811,7 @@ static const struct intel_uncore_init_fun spr_uncore_init __initconst = {
.pci_init = spr_uncore_pci_init,
.mmio_init = spr_uncore_mmio_init,
.use_discovery = true,
+ .uncore_units_ignore = spr_uncore_units_ignore,
};
static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
@@ -1832,7 +1862,10 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
};
@@ -1852,7 +1885,7 @@ static int __init intel_uncore_init(void)
id = x86_match_cpu(intel_uncore_match);
if (!id) {
- if (!uncore_no_discover && intel_uncore_has_discovery_tables())
+ if (!uncore_no_discover && intel_uncore_has_discovery_tables(NULL))
uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init;
else
return -ENODEV;
@@ -1860,7 +1893,8 @@ static int __init intel_uncore_init(void)
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_no_discover && uncore_init->use_discovery)
return -ENODEV;
- if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables())
+ if (uncore_init->use_discovery &&
+ !intel_uncore_has_discovery_tables(uncore_init->uncore_units_ignore))
return -ENODEV;
}
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 2adeaf4de4df..c30fb5bb1222 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -2,6 +2,7 @@
#include <linux/slab.h>
#include <linux/pci.h>
#include <asm/apicdef.h>
+#include <asm/intel-family.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/perf_event.h>
@@ -33,6 +34,8 @@
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
+#define UNCORE_IGNORE_END -1
+
struct pci_extra_dev {
struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX];
};
@@ -88,12 +91,12 @@ struct intel_uncore_type {
* to identify which platform component each PMON block of that type is
* supposed to monitor.
*/
- struct intel_uncore_topology *topology;
+ struct intel_uncore_topology **topology;
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
int (*get_topology)(struct intel_uncore_type *type);
- int (*set_mapping)(struct intel_uncore_type *type);
+ void (*set_mapping)(struct intel_uncore_type *type);
void (*cleanup_mapping)(struct intel_uncore_type *type);
};
@@ -178,11 +181,26 @@ struct freerunning_counters {
unsigned *box_offsets;
};
-struct intel_uncore_topology {
- u64 configuration;
+struct uncore_iio_topology {
+ int pci_bus_no;
int segment;
};
+struct uncore_upi_topology {
+ int die_to;
+ int pmu_idx_to;
+ int enabled;
+};
+
+struct intel_uncore_topology {
+ int pmu_idx;
+ union {
+ void *untyped;
+ struct uncore_iio_topology *iio;
+ struct uncore_upi_topology *upi;
+ };
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
@@ -192,6 +210,7 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
int uncore_pcibus_to_dieid(struct pci_bus *bus);
int uncore_die_to_segment(int die);
+int uncore_device_to_die(struct pci_dev *dev);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
@@ -573,6 +592,7 @@ extern raw_spinlock_t pci2phy_map_lock;
extern struct list_head pci2phy_map_head;
extern struct pci_extra_dev *uncore_extra_pci_dev;
extern struct event_constraint uncore_constraint_empty;
+extern int spr_uncore_units_ignore[];
/* uncore_snb.c */
int snb_uncore_pci_init(void);
@@ -586,6 +606,7 @@ void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
+void mtl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 5fd72d4b8bbb..cb488e41807c 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -33,7 +33,7 @@ static int logical_die_id;
static int get_device_die_id(struct pci_dev *dev)
{
- int cpu, node = pcibus_to_node(dev->bus);
+ int node = pcibus_to_node(dev->bus);
/*
* If the NUMA info is not available, assume that the logical die id is
@@ -43,19 +43,7 @@ static int get_device_die_id(struct pci_dev *dev)
if (node < 0)
return logical_die_id++;
- for_each_cpu(cpu, cpumask_of_node(node)) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- if (c->initialized && cpu_to_node(cpu) == node)
- return c->logical_die_id;
- }
-
- /*
- * All CPUs of a node may be offlined. For this case,
- * the PCI and MMIO type of uncore blocks which are
- * enumerated by the device will be unavailable.
- */
- return -1;
+ return uncore_device_to_die(dev);
}
#define __node_2_type(cur) \
@@ -140,13 +128,21 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
unsigned int *box_offset, *ids;
int i;
- if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset))
+ if (!unit->ctl || !unit->ctl_offset || !unit->ctr_offset) {
+ pr_info("Invalid address is detected for uncore type %d box %d, "
+ "Disable the uncore unit.\n",
+ unit->box_type, unit->box_id);
return;
+ }
if (parsed) {
type = search_uncore_discovery_type(unit->box_type);
- if (WARN_ON_ONCE(!type))
+ if (!type) {
+ pr_info("A spurious uncore type %d is detected, "
+ "Disable the uncore type.\n",
+ unit->box_type);
return;
+ }
/* Store the first box of each die */
if (!type->box_ctrl_die[die])
type->box_ctrl_die[die] = unit->ctl;
@@ -181,8 +177,12 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
ids[i] = type->ids[i];
box_offset[i] = type->box_offset[i];
- if (WARN_ON_ONCE(unit->box_id == ids[i]))
+ if (unit->box_id == ids[i]) {
+ pr_info("Duplicate uncore type %d box ID %d is detected, "
+ "Drop the duplicate uncore unit.\n",
+ unit->box_type, unit->box_id);
goto free_ids;
+ }
}
ids[i] = unit->box_id;
box_offset[i] = unit->ctl - type->box_ctrl;
@@ -202,8 +202,25 @@ free_box_offset:
}
+static bool
+uncore_ignore_unit(struct uncore_unit_discovery *unit, int *ignore)
+{
+ int i;
+
+ if (!ignore)
+ return false;
+
+ for (i = 0; ignore[i] != UNCORE_IGNORE_END ; i++) {
+ if (unit->box_type == ignore[i])
+ return true;
+ }
+
+ return false;
+}
+
static int parse_discovery_table(struct pci_dev *dev, int die,
- u32 bar_offset, bool *parsed)
+ u32 bar_offset, bool *parsed,
+ int *ignore)
{
struct uncore_global_discovery global;
struct uncore_unit_discovery unit;
@@ -258,6 +275,9 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
if (unit.access_type >= UNCORE_ACCESS_MAX)
continue;
+ if (uncore_ignore_unit(&unit, ignore))
+ continue;
+
uncore_insert_box_info(&unit, die, *parsed);
}
@@ -266,7 +286,7 @@ static int parse_discovery_table(struct pci_dev *dev, int die,
return 0;
}
-bool intel_uncore_has_discovery_tables(void)
+bool intel_uncore_has_discovery_tables(int *ignore)
{
u32 device, val, entry_id, bar_offset;
int die, dvsec = 0, ret = true;
@@ -302,7 +322,7 @@ bool intel_uncore_has_discovery_tables(void)
if (die < 0)
continue;
- parse_discovery_table(dev, die, bar_offset, &parsed);
+ parse_discovery_table(dev, die, bar_offset, &parsed, ignore);
}
}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index f4439357779a..6ee80ad3423e 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -21,9 +21,15 @@
/* Global discovery table size */
#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20
-#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7)
-#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff)
-#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET 28
+#define UNCORE_DISCOVERY_PCI_DOMAIN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET) & 0x7)
+#define UNCORE_DISCOVERY_PCI_BUS_OFFSET 20
+#define UNCORE_DISCOVERY_PCI_BUS(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_BUS_OFFSET) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DEVFN_OFFSET 12
+#define UNCORE_DISCOVERY_PCI_DEVFN(data) \
+ ((data >> UNCORE_DISCOVERY_PCI_DEVFN_OFFSET) & 0xff)
#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff)
@@ -122,7 +128,7 @@ struct intel_uncore_discovery_type {
unsigned int *box_offset; /* Box offset */
};
-bool intel_uncore_has_discovery_tables(void);
+bool intel_uncore_has_discovery_tables(int *ignore);
void intel_uncore_clear_discovery_tables(void);
void intel_uncore_generic_uncore_cpu_init(void);
int intel_uncore_generic_uncore_pci_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 1ef4f7861e2e..7fd4334e12a1 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -109,6 +109,19 @@
#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728
#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729
#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A
+#define PCI_DEVICE_ID_INTEL_MTL_1_IMC 0x7d00
+#define PCI_DEVICE_ID_INTEL_MTL_2_IMC 0x7d01
+#define PCI_DEVICE_ID_INTEL_MTL_3_IMC 0x7d02
+#define PCI_DEVICE_ID_INTEL_MTL_4_IMC 0x7d05
+#define PCI_DEVICE_ID_INTEL_MTL_5_IMC 0x7d10
+#define PCI_DEVICE_ID_INTEL_MTL_6_IMC 0x7d14
+#define PCI_DEVICE_ID_INTEL_MTL_7_IMC 0x7d15
+#define PCI_DEVICE_ID_INTEL_MTL_8_IMC 0x7d16
+#define PCI_DEVICE_ID_INTEL_MTL_9_IMC 0x7d21
+#define PCI_DEVICE_ID_INTEL_MTL_10_IMC 0x7d22
+#define PCI_DEVICE_ID_INTEL_MTL_11_IMC 0x7d23
+#define PCI_DEVICE_ID_INTEL_MTL_12_IMC 0x7d24
+#define PCI_DEVICE_ID_INTEL_MTL_13_IMC 0x7d28
#define IMC_UNCORE_DEV(a) \
@@ -205,6 +218,32 @@
#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0
#define ADL_UNC_ARB_MSR_OFFSET 0x8
+/* MTL Cbo register */
+#define MTL_UNC_CBO_0_PER_CTR0 0x2448
+#define MTL_UNC_CBO_0_PERFEVTSEL0 0x2442
+
+/* MTL HAC_ARB register */
+#define MTL_UNC_HAC_ARB_CTR 0x2018
+#define MTL_UNC_HAC_ARB_CTRL 0x2012
+
+/* MTL ARB register */
+#define MTL_UNC_ARB_CTR 0x2418
+#define MTL_UNC_ARB_CTRL 0x2412
+
+/* MTL cNCU register */
+#define MTL_UNC_CNCU_FIXED_CTR 0x2408
+#define MTL_UNC_CNCU_FIXED_CTRL 0x2402
+#define MTL_UNC_CNCU_BOX_CTL 0x240e
+
+/* MTL sNCU register */
+#define MTL_UNC_SNCU_FIXED_CTR 0x2008
+#define MTL_UNC_SNCU_FIXED_CTRL 0x2002
+#define MTL_UNC_SNCU_BOX_CTL 0x200e
+
+/* MTL HAC_CBO register */
+#define MTL_UNC_HBO_CTR 0x2048
+#define MTL_UNC_HBO_CTRL 0x2042
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
@@ -598,6 +637,115 @@ void adl_uncore_cpu_init(void)
uncore_msr_uncores = adl_msr_uncores;
}
+static struct intel_uncore_type mtl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = MTL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_arb = {
+ .name = "hac_arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HAC_ARB_CTR,
+ .event_ctl = MTL_UNC_HAC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_ARB_CTR,
+ .event_ctl = MTL_UNC_ARB_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type mtl_uncore_hac_cbox = {
+ .name = "hac_cbox",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = MTL_UNC_HBO_CTR,
+ .event_ctl = MTL_UNC_HBO_CTRL,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .ops = &icl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static void mtl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static struct intel_uncore_ops mtl_uncore_msr_ops = {
+ .init_box = mtl_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type mtl_uncore_cncu = {
+ .name = "cncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_CNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_CNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_CNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type mtl_uncore_sncu = {
+ .name = "sncu",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .box_ctl = MTL_UNC_SNCU_BOX_CTL,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = MTL_UNC_SNCU_FIXED_CTR,
+ .fixed_ctl = MTL_UNC_SNCU_FIXED_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &mtl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *mtl_msr_uncores[] = {
+ &mtl_uncore_cbox,
+ &mtl_uncore_hac_arb,
+ &mtl_uncore_arb,
+ &mtl_uncore_hac_cbox,
+ &mtl_uncore_cncu,
+ &mtl_uncore_sncu,
+ NULL
+};
+
+void mtl_uncore_cpu_init(void)
+{
+ mtl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = mtl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -1264,6 +1412,19 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
IMC_UNCORE_DEV(RPL_23),
IMC_UNCORE_DEV(RPL_24),
IMC_UNCORE_DEV(RPL_25),
+ IMC_UNCORE_DEV(MTL_1),
+ IMC_UNCORE_DEV(MTL_2),
+ IMC_UNCORE_DEV(MTL_3),
+ IMC_UNCORE_DEV(MTL_4),
+ IMC_UNCORE_DEV(MTL_5),
+ IMC_UNCORE_DEV(MTL_6),
+ IMC_UNCORE_DEV(MTL_7),
+ IMC_UNCORE_DEV(MTL_8),
+ IMC_UNCORE_DEV(MTL_9),
+ IMC_UNCORE_DEV(MTL_10),
+ IMC_UNCORE_DEV(MTL_11),
+ IMC_UNCORE_DEV(MTL_12),
+ IMC_UNCORE_DEV(MTL_13),
{ /* end: all zeroes */ }
};
@@ -1338,6 +1499,7 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
/* MCHBAR is disabled */
if (!(mch_bar & BIT(0))) {
pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
+ pci_dev_put(pdev);
return;
}
mch_bar &= ~BIT(0);
@@ -1352,6 +1514,8 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
+ pci_dev_put(pdev);
}
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index ed869443efb2..d49e90dc04a4 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -445,6 +445,7 @@
#define ICX_UPI_PCI_PMON_CTR0 0x320
#define ICX_UPI_PCI_PMON_BOX_CTL 0x318
#define ICX_UPI_CTL_UMASK_EXT 0xffffff
+#define ICX_UBOX_DID 0x3450
/* ICX M3UPI*/
#define ICX_M3UPI_PCI_PMON_CTL0 0xd8
@@ -457,6 +458,7 @@
/* SPR */
#define SPR_RAW_EVENT_MASK_EXT 0xffffff
+#define SPR_UBOX_DID 0x3250
/* SPR CHA */
#define SPR_CHA_PMON_CTL_TID_EN (1 << 16)
@@ -1372,6 +1374,28 @@ static struct pci_driver snbep_uncore_pci_driver = {
#define NODE_ID_MASK 0x7
+/* Each three bits from 0 to 23 of GIDNIDMAP register correspond Node ID. */
+#define GIDNIDMAP(config, id) (((config) >> (3 * (id))) & 0x7)
+
+static int upi_nodeid_groupid(struct pci_dev *ubox_dev, int nodeid_loc, int idmap_loc,
+ int *nodeid, int *groupid)
+{
+ int ret;
+
+ /* get the Node ID of the local register */
+ ret = pci_read_config_dword(ubox_dev, nodeid_loc, nodeid);
+ if (ret)
+ goto err;
+
+ *nodeid = *nodeid & NODE_ID_MASK;
+ /* get the Node ID mapping */
+ ret = pci_read_config_dword(ubox_dev, idmap_loc, groupid);
+ if (ret)
+ goto err;
+err:
+ return ret;
+}
+
/*
* build pci bus to socket mapping
*/
@@ -1397,13 +1421,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* the topology.
*/
if (nr_node_ids <= 8) {
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
- if (err)
- break;
- nodeid = config & NODE_ID_MASK;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
+ err = upi_nodeid_groupid(ubox_dev, nodeid_loc, idmap_loc,
+ &nodeid, &config);
if (err)
break;
@@ -1421,7 +1440,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
* to a particular node.
*/
for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
+ if (nodeid == GIDNIDMAP(config, i)) {
if (topology_max_die_per_package() > 1)
die_id = i;
else
@@ -1434,9 +1453,6 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
}
raw_spin_unlock(&pci2phy_map_lock);
} else {
- int node = pcibus_to_node(ubox_dev->bus);
- int cpu;
-
segment = pci_domain_nr(ubox_dev->bus);
raw_spin_lock(&pci2phy_map_lock);
map = __find_pci2phy_map(segment);
@@ -1446,15 +1462,8 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
break;
}
- die_id = -1;
- for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) {
- struct cpuinfo_x86 *c = &cpu_data(cpu);
+ map->pbus_to_dieid[bus] = die_id = uncore_device_to_die(ubox_dev);
- if (c->initialized && cpu_to_node(cpu) == node) {
- map->pbus_to_dieid[bus] = die_id = c->logical_die_id;
- break;
- }
- }
raw_spin_unlock(&pci2phy_map_lock);
if (WARN_ON_ONCE(die_id == -1)) {
@@ -2891,6 +2900,7 @@ static bool hswep_has_limit_sbox(unsigned int device)
return false;
pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4);
+ pci_dev_put(dev);
if (!hswep_get_chop(capid4))
return true;
@@ -3699,10 +3709,16 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
.read_counter = uncore_msr_read_counter,
};
-static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+static struct intel_uncore_topology *pmu_topology(struct intel_uncore_pmu *pmu, int die)
{
- return pmu->type->topology[die].configuration >>
- (pmu->pmu_idx * BUS_NUM_STRIDE);
+ int idx;
+
+ for (idx = 0; idx < pmu->type->num_boxes; idx++) {
+ if (pmu->type->topology[die][idx].pmu_idx == pmu->pmu_idx)
+ return &pmu->type->topology[die][idx];
+ }
+
+ return NULL;
}
static umode_t
@@ -3710,8 +3726,9 @@ pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr,
int die, int zero_bus_pmu)
{
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return (!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
+ return (pmut && !pmut->iio->pci_bus_no && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode;
}
static umode_t
@@ -3727,9 +3744,10 @@ static ssize_t skx_iio_mapping_show(struct device *dev,
struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
+ struct intel_uncore_topology *pmut = pmu_topology(pmu, die);
- return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
- skx_iio_stack(pmu, die));
+ return sprintf(buf, "%04x:%02x\n", pmut ? pmut->iio->segment : 0,
+ pmut ? pmut->iio->pci_bus_no : 0);
}
static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3764,18 +3782,79 @@ static int die_to_cpu(int die)
return res;
}
-static int skx_iio_get_topology(struct intel_uncore_type *type)
+enum {
+ IIO_TOPOLOGY_TYPE,
+ UPI_TOPOLOGY_TYPE,
+ TOPOLOGY_MAX
+};
+
+static const size_t topology_size[TOPOLOGY_MAX] = {
+ sizeof(*((struct intel_uncore_topology *)NULL)->iio),
+ sizeof(*((struct intel_uncore_topology *)NULL)->upi)
+};
+
+static int pmu_alloc_topology(struct intel_uncore_type *type, int topology_type)
{
- int die, ret = -EPERM;
+ int die, idx;
+ struct intel_uncore_topology **topology;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
+ if (!type->num_boxes)
+ return -EPERM;
+
+ topology = kcalloc(uncore_max_dies(), sizeof(*topology), GFP_KERNEL);
+ if (!topology)
+ goto err;
for (die = 0; die < uncore_max_dies(); die++) {
- ret = skx_msr_cpu_bus_read(die_to_cpu(die),
- &type->topology[die].configuration);
+ topology[die] = kcalloc(type->num_boxes, sizeof(**topology), GFP_KERNEL);
+ if (!topology[die])
+ goto clear;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ topology[die][idx].untyped = kcalloc(type->num_boxes,
+ topology_size[topology_type],
+ GFP_KERNEL);
+ if (!topology[die][idx].untyped)
+ goto clear;
+ }
+ }
+
+ type->topology = topology;
+
+ return 0;
+clear:
+ for (; die >= 0; die--) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(topology[die][idx].untyped);
+ kfree(topology[die]);
+ }
+ kfree(topology);
+err:
+ return -ENOMEM;
+}
+
+static void pmu_free_topology(struct intel_uncore_type *type)
+{
+ int die, idx;
+
+ if (type->topology) {
+ for (die = 0; die < uncore_max_dies(); die++) {
+ for (idx = 0; idx < type->num_boxes; idx++)
+ kfree(type->topology[die][idx].untyped);
+ kfree(type->topology[die]);
+ }
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+}
+
+static int skx_pmu_get_topology(struct intel_uncore_type *type,
+ int (*topology_cb)(struct intel_uncore_type*, int, int, u64))
+{
+ int die, ret = -EPERM;
+ u64 cpu_bus_msr;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die), &cpu_bus_msr);
if (ret)
break;
@@ -3783,15 +3862,33 @@ static int skx_iio_get_topology(struct intel_uncore_type *type)
if (ret < 0)
break;
- type->topology[die].segment = ret;
+ ret = topology_cb(type, ret, die, cpu_bus_msr);
+ if (ret)
+ break;
}
- if (ret < 0) {
- kfree(type->topology);
- type->topology = NULL;
+ return ret;
+}
+
+static int skx_iio_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx;
+ struct intel_uncore_topology *t;
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ t = &type->topology[die][idx];
+ t->pmu_idx = idx;
+ t->iio->segment = segment;
+ t->iio->pci_bus_no = (cpu_bus_msr >> (idx * BUS_NUM_STRIDE)) & 0xff;
}
- return ret;
+ return 0;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ return skx_pmu_get_topology(type, skx_iio_topology_cb);
}
static struct attribute_group skx_iio_mapping_group = {
@@ -3803,8 +3900,25 @@ static const struct attribute_group *skx_iio_attr_update[] = {
NULL,
};
-static int
-pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+static void pmu_clear_mapping_attr(const struct attribute_group **groups,
+ struct attribute_group *ag)
+{
+ int i;
+
+ for (i = 0; groups[i]; i++) {
+ if (groups[i] == ag) {
+ for (i++; groups[i]; i++)
+ groups[i - 1] = groups[i];
+ groups[i - 1] = NULL;
+ break;
+ }
+ }
+}
+
+static void
+pmu_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag,
+ ssize_t (*show)(struct device*, struct device_attribute*, char*),
+ int topology_type)
{
char buf[64];
int ret;
@@ -3812,11 +3926,13 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
struct attribute **attrs = NULL;
struct dev_ext_attribute *eas = NULL;
- ret = type->get_topology(type);
+ ret = pmu_alloc_topology(type, topology_type);
if (ret < 0)
goto clear_attr_update;
- ret = -ENOMEM;
+ ret = type->get_topology(type);
+ if (ret < 0)
+ goto clear_topology;
/* One more for NULL. */
attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
@@ -3828,20 +3944,20 @@ pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
goto clear_attrs;
for (die = 0; die < uncore_max_dies(); die++) {
- sprintf(buf, "die%ld", die);
+ snprintf(buf, sizeof(buf), "die%ld", die);
sysfs_attr_init(&eas[die].attr.attr);
eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
if (!eas[die].attr.attr.name)
goto err;
eas[die].attr.attr.mode = 0444;
- eas[die].attr.show = skx_iio_mapping_show;
+ eas[die].attr.show = show;
eas[die].attr.store = NULL;
eas[die].var = (void *)die;
attrs[die] = &eas[die].attr.attr;
}
ag->attrs = attrs;
- return 0;
+ return;
err:
for (; die >= 0; die--)
kfree(eas[die].attr.attr.name);
@@ -3849,14 +3965,13 @@ err:
clear_attrs:
kfree(attrs);
clear_topology:
- kfree(type->topology);
+ pmu_free_topology(type);
clear_attr_update:
- type->attr_update = NULL;
- return ret;
+ pmu_clear_mapping_attr(type->attr_update, ag);
}
static void
-pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+pmu_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
{
struct attribute **attr = ag->attrs;
@@ -3868,17 +3983,23 @@ pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *
kfree(attr_to_ext_attr(*ag->attrs));
kfree(ag->attrs);
ag->attrs = NULL;
- kfree(type->topology);
+ pmu_free_topology(type);
+}
+
+static void
+pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_iio_mapping_show, IIO_TOPOLOGY_TYPE);
}
-static int skx_iio_set_mapping(struct intel_uncore_type *type)
+static void skx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &skx_iio_mapping_group);
+ pmu_iio_set_mapping(type, &skx_iio_mapping_group);
}
static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &skx_iio_mapping_group);
}
static struct intel_uncore_type skx_uncore_iio = {
@@ -4139,6 +4260,132 @@ static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
.read_counter = snbep_uncore_pci_read_counter,
};
+static umode_t
+skx_upi_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ return pmu->type->topology[die][pmu->pmu_idx].upi->enabled ? attr->mode : 0;
+}
+
+static ssize_t skx_upi_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+ struct uncore_upi_topology *upi = pmu->type->topology[die][pmu->pmu_idx].upi;
+
+ return sysfs_emit(buf, "upi_%d,die_%d\n", upi->pmu_idx_to, upi->die_to);
+}
+
+#define SKX_UPI_REG_DID 0x2058
+#define SKX_UPI_REGS_ADDR_DEVICE_LINK0 0x0e
+#define SKX_UPI_REGS_ADDR_FUNCTION 0x00
+
+/*
+ * UPI Link Parameter 0
+ * | Bit | Default | Description
+ * | 19:16 | 0h | base_nodeid - The NodeID of the sending socket.
+ * | 12:8 | 00h | sending_port - The processor die port number of the sending port.
+ */
+#define SKX_KTILP0_OFFSET 0x94
+
+/*
+ * UPI Pcode Status. This register is used by PCode to store the link training status.
+ * | Bit | Default | Description
+ * | 4 | 0h | ll_status_valid — Bit indicates the valid training status
+ * logged from PCode to the BIOS.
+ */
+#define SKX_KTIPCSTS_OFFSET 0x120
+
+static int upi_fill_topology(struct pci_dev *dev, struct intel_uncore_topology *tp,
+ int pmu_idx)
+{
+ int ret;
+ u32 upi_conf;
+ struct uncore_upi_topology *upi = tp->upi;
+
+ tp->pmu_idx = pmu_idx;
+ ret = pci_read_config_dword(dev, SKX_KTIPCSTS_OFFSET, &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->enabled = (upi_conf >> 4) & 1;
+ if (upi->enabled) {
+ ret = pci_read_config_dword(dev, SKX_KTILP0_OFFSET,
+ &upi_conf);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ goto err;
+ }
+ upi->die_to = (upi_conf >> 16) & 0xf;
+ upi->pmu_idx_to = (upi_conf >> 8) & 0x1f;
+ }
+err:
+ return ret;
+}
+
+static int skx_upi_topology_cb(struct intel_uncore_type *type, int segment,
+ int die, u64 cpu_bus_msr)
+{
+ int idx, ret;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+ struct pci_dev *dev = NULL;
+ u8 bus = cpu_bus_msr >> (3 * BUS_NUM_STRIDE);
+
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[die][idx];
+ devfn = PCI_DEVFN(SKX_UPI_REGS_ADDR_DEVICE_LINK0 + idx,
+ SKX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(segment, bus, devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ break;
+ }
+ }
+
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int skx_upi_get_topology(struct intel_uncore_type *type)
+{
+ /* CPX case is not supported */
+ if (boot_cpu_data.x86_stepping == 11)
+ return -EPERM;
+
+ return skx_pmu_get_topology(type, skx_upi_topology_cb);
+}
+
+static struct attribute_group skx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *skx_upi_attr_update[] = {
+ &skx_upi_mapping_group,
+ NULL
+};
+
+static void
+pmu_upi_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag)
+{
+ pmu_set_mapping(type, ag, skx_upi_mapping_show, UPI_TOPOLOGY_TYPE);
+}
+
+static void skx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &skx_upi_mapping_group);
+}
+
+static void skx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &skx_upi_mapping_group);
+}
+
static struct intel_uncore_type skx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -4151,6 +4398,10 @@ static struct intel_uncore_type skx_uncore_upi = {
.box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &skx_upi_uncore_format_group,
+ .attr_update = skx_upi_attr_update,
+ .get_topology = skx_upi_get_topology,
+ .set_mapping = skx_upi_set_mapping,
+ .cleanup_mapping = skx_upi_cleanup_mapping,
};
static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
@@ -4461,11 +4712,6 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
int die, stack_id, ret = -EPERM;
struct pci_dev *dev = NULL;
- type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
- GFP_KERNEL);
- if (!type->topology)
- return -ENOMEM;
-
while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) {
ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg);
if (ret) {
@@ -4483,14 +4729,12 @@ static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_map
/* Convert stack id from SAD_CONTROL to PMON notation. */
stack_id = sad_pmon_mapping[stack_id];
- ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number;
- type->topology[die].segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].iio->segment = pci_domain_nr(dev->bus);
+ type->topology[die][stack_id].pmu_idx = stack_id;
+ type->topology[die][stack_id].iio->pci_bus_no = dev->bus->number;
}
- if (ret) {
- kfree(type->topology);
- type->topology = NULL;
- }
+ pci_dev_put(dev);
return ret;
}
@@ -4519,14 +4763,14 @@ static int snr_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, snr_sad_pmon_mapping);
}
-static int snr_iio_set_mapping(struct intel_uncore_type *type)
+static void snr_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &snr_iio_mapping_group);
+ pmu_iio_set_mapping(type, &snr_iio_mapping_group);
}
static void snr_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group);
+ pmu_cleanup_mapping(type, &snr_iio_mapping_group);
}
static struct event_constraint snr_uncore_iio_constraints[] = {
@@ -4857,6 +5101,8 @@ static int snr_uncore_mmio_map(struct intel_uncore_box *box,
addr += box_ctl;
+ pci_dev_put(pdev);
+
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr) {
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
@@ -5137,14 +5383,19 @@ static int icx_iio_get_topology(struct intel_uncore_type *type)
return sad_cfg_iio_topology(type, icx_sad_pmon_mapping);
}
-static int icx_iio_set_mapping(struct intel_uncore_type *type)
+static void icx_iio_set_mapping(struct intel_uncore_type *type)
{
- return pmu_iio_set_mapping(type, &icx_iio_mapping_group);
+ /* Detect ICX-D system. This case is not supported */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ICELAKE_D) {
+ pmu_clear_mapping_attr(type->attr_update, &icx_iio_mapping_group);
+ return;
+ }
+ pmu_iio_set_mapping(type, &icx_iio_mapping_group);
}
static void icx_iio_cleanup_mapping(struct intel_uncore_type *type)
{
- pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group);
+ pmu_cleanup_mapping(type, &icx_iio_mapping_group);
}
static struct intel_uncore_type icx_uncore_iio = {
@@ -5337,6 +5588,76 @@ static const struct attribute_group icx_upi_uncore_format_group = {
.attrs = icx_upi_uncore_formats_attr,
};
+#define ICX_UPI_REGS_ADDR_DEVICE_LINK0 0x02
+#define ICX_UPI_REGS_ADDR_FUNCTION 0x01
+
+static int discover_upi_topology(struct intel_uncore_type *type, int ubox_did, int dev_link0)
+{
+ struct pci_dev *ubox = NULL;
+ struct pci_dev *dev = NULL;
+ u32 nid, gid;
+ int i, idx, ret = -EPERM;
+ struct intel_uncore_topology *upi;
+ unsigned int devfn;
+
+ /* GIDNIDMAP method supports machines which have less than 8 sockets. */
+ if (uncore_max_dies() > 8)
+ goto err;
+
+ while ((ubox = pci_get_device(PCI_VENDOR_ID_INTEL, ubox_did, ubox))) {
+ ret = upi_nodeid_groupid(ubox, SKX_CPUNODEID, SKX_GIDNIDMAP, &nid, &gid);
+ if (ret) {
+ ret = pcibios_err_to_errno(ret);
+ break;
+ }
+
+ for (i = 0; i < 8; i++) {
+ if (nid != GIDNIDMAP(gid, i))
+ continue;
+ for (idx = 0; idx < type->num_boxes; idx++) {
+ upi = &type->topology[nid][idx];
+ devfn = PCI_DEVFN(dev_link0 + idx, ICX_UPI_REGS_ADDR_FUNCTION);
+ dev = pci_get_domain_bus_and_slot(pci_domain_nr(ubox->bus),
+ ubox->bus->number,
+ devfn);
+ if (dev) {
+ ret = upi_fill_topology(dev, upi, idx);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+ }
+err:
+ pci_dev_put(ubox);
+ pci_dev_put(dev);
+ return ret;
+}
+
+static int icx_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, ICX_UBOX_DID, ICX_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
+static struct attribute_group icx_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
+};
+
+static const struct attribute_group *icx_upi_attr_update[] = {
+ &icx_upi_mapping_group,
+ NULL
+};
+
+static void icx_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &icx_upi_mapping_group);
+}
+
+static void icx_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &icx_upi_mapping_group);
+}
+
static struct intel_uncore_type icx_uncore_upi = {
.name = "upi",
.num_counters = 4,
@@ -5349,6 +5670,10 @@ static struct intel_uncore_type icx_uncore_upi = {
.box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
.ops = &skx_upi_uncore_pci_ops,
.format_group = &icx_upi_uncore_format_group,
+ .attr_update = icx_upi_attr_update,
+ .get_topology = icx_upi_get_topology,
+ .set_mapping = icx_upi_set_mapping,
+ .cleanup_mapping = icx_upi_cleanup_mapping,
};
static struct event_constraint icx_uncore_m3upi_constraints[] = {
@@ -5743,6 +6068,17 @@ static struct intel_uncore_ops spr_uncore_mmio_ops = {
.read_counter = uncore_mmio_read_counter,
};
+static struct uncore_event_desc spr_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x05,umask=0xcf"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x05,umask=0xf0"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"),
+ { /* end: all zeroes */ },
+};
+
static struct intel_uncore_type spr_uncore_imc = {
SPR_UNCORE_COMMON_FORMAT(),
.name = "imc",
@@ -5750,6 +6086,7 @@ static struct intel_uncore_type spr_uncore_imc = {
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
.ops = &spr_uncore_mmio_ops,
+ .event_descs = spr_uncore_imc_events,
};
static void spr_uncore_pci_enable_event(struct intel_uncore_box *box,
@@ -5780,26 +6117,49 @@ static struct intel_uncore_type spr_uncore_m2m = {
.name = "m2m",
};
-static struct intel_uncore_type spr_uncore_upi = {
- SPR_UNCORE_PCI_COMMON_FORMAT(),
- .name = "upi",
+static struct attribute_group spr_upi_mapping_group = {
+ .is_visible = skx_upi_mapping_visible,
};
-static struct intel_uncore_type spr_uncore_m3upi = {
- SPR_UNCORE_PCI_COMMON_FORMAT(),
- .name = "m3upi",
- .constraints = icx_uncore_m3upi_constraints,
+static const struct attribute_group *spr_upi_attr_update[] = {
+ &uncore_alias_group,
+ &spr_upi_mapping_group,
+ NULL
};
+#define SPR_UPI_REGS_ADDR_DEVICE_LINK0 0x01
+
+static void spr_upi_set_mapping(struct intel_uncore_type *type)
+{
+ pmu_upi_set_mapping(type, &spr_upi_mapping_group);
+}
+
+static void spr_upi_cleanup_mapping(struct intel_uncore_type *type)
+{
+ pmu_cleanup_mapping(type, &spr_upi_mapping_group);
+}
+
+static int spr_upi_get_topology(struct intel_uncore_type *type)
+{
+ return discover_upi_topology(type, SPR_UBOX_DID, SPR_UPI_REGS_ADDR_DEVICE_LINK0);
+}
+
static struct intel_uncore_type spr_uncore_mdf = {
SPR_UNCORE_COMMON_FORMAT(),
.name = "mdf",
};
#define UNCORE_SPR_NUM_UNCORE_TYPES 12
+#define UNCORE_SPR_CHA 0
#define UNCORE_SPR_IIO 1
#define UNCORE_SPR_IMC 6
+#define UNCORE_SPR_UPI 8
+#define UNCORE_SPR_M3UPI 9
+/*
+ * The uncore units, which are supported by the discovery table,
+ * are defined here.
+ */
static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
&spr_uncore_chabox,
&spr_uncore_iio,
@@ -5809,12 +6169,56 @@ static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = {
NULL,
&spr_uncore_imc,
&spr_uncore_m2m,
- &spr_uncore_upi,
- &spr_uncore_m3upi,
+ NULL,
+ NULL,
NULL,
&spr_uncore_mdf,
};
+/*
+ * The uncore units, which are not supported by the discovery table,
+ * are implemented from here.
+ */
+#define SPR_UNCORE_UPI_NUM_BOXES 4
+
+static unsigned int spr_upi_pci_offsets[SPR_UNCORE_UPI_NUM_BOXES] = {
+ 0, 0x8000, 0x10000, 0x18000
+};
+
+static struct intel_uncore_type spr_uncore_upi = {
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SPR_RAW_EVENT_MASK_EXT,
+ .format_group = &spr_uncore_raw_format_group,
+ .ops = &spr_uncore_pci_ops,
+ .name = "upi",
+ .attr_update = spr_upi_attr_update,
+ .get_topology = spr_upi_get_topology,
+ .set_mapping = spr_upi_set_mapping,
+ .cleanup_mapping = spr_upi_cleanup_mapping,
+ .type_id = UNCORE_SPR_UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_UPI_PCI_PMON_CTL0,
+ .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+};
+
+static struct intel_uncore_type spr_uncore_m3upi = {
+ SPR_UNCORE_PCI_COMMON_FORMAT(),
+ .name = "m3upi",
+ .type_id = UNCORE_SPR_M3UPI,
+ .num_counters = 4,
+ .num_boxes = SPR_UNCORE_UPI_NUM_BOXES,
+ .perf_ctr_bits = 48,
+ .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0,
+ .event_ctl = ICX_M3UPI_PCI_PMON_CTL0,
+ .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL,
+ .pci_offsets = spr_upi_pci_offsets,
+ .constraints = icx_uncore_m3upi_constraints,
+};
+
enum perf_uncore_spr_iio_freerunning_type_id {
SPR_IIO_MSR_IOCLK,
SPR_IIO_MSR_BW_IN,
@@ -5945,6 +6349,7 @@ static struct intel_uncore_type spr_uncore_imc_free_running = {
#define UNCORE_SPR_MSR_EXTRA_UNCORES 1
#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1
+#define UNCORE_SPR_PCI_EXTRA_UNCORES 2
static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = {
&spr_uncore_iio_free_running,
@@ -5954,6 +6359,17 @@ static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES]
&spr_uncore_imc_free_running,
};
+static struct intel_uncore_type *spr_pci_uncores[UNCORE_SPR_PCI_EXTRA_UNCORES] = {
+ &spr_uncore_upi,
+ &spr_uncore_m3upi
+};
+
+int spr_uncore_units_ignore[] = {
+ UNCORE_SPR_UPI,
+ UNCORE_SPR_M3UPI,
+ UNCORE_IGNORE_END
+};
+
static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
struct intel_uncore_type *from_type)
{
@@ -5986,6 +6402,12 @@ static void uncore_type_customized_copy(struct intel_uncore_type *to_type,
to_type->format_group = from_type->format_group;
if (from_type->attr_update)
to_type->attr_update = from_type->attr_update;
+ if (from_type->set_mapping)
+ to_type->set_mapping = from_type->set_mapping;
+ if (from_type->get_topology)
+ to_type->get_topology = from_type->get_topology;
+ if (from_type->cleanup_mapping)
+ to_type->cleanup_mapping = from_type->cleanup_mapping;
}
static struct intel_uncore_type **
@@ -6039,18 +6461,88 @@ static int uncore_type_max_boxes(struct intel_uncore_type **types,
return max + 1;
}
+#define SPR_MSR_UNC_CBO_CONFIG 0x2FFE
+
void spr_uncore_cpu_init(void)
{
+ struct intel_uncore_type *type;
+ u64 num_cbo;
+
uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR,
UNCORE_SPR_MSR_EXTRA_UNCORES,
spr_msr_uncores);
+ type = uncore_find_type_by_id(uncore_msr_uncores, UNCORE_SPR_CHA);
+ if (type) {
+ rdmsrl(SPR_MSR_UNC_CBO_CONFIG, num_cbo);
+ type->num_boxes = num_cbo;
+ }
spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO);
}
+#define SPR_UNCORE_UPI_PCIID 0x3241
+#define SPR_UNCORE_UPI0_DEVFN 0x9
+#define SPR_UNCORE_M3UPI_PCIID 0x3246
+#define SPR_UNCORE_M3UPI0_DEVFN 0x29
+
+static void spr_update_device_location(int type_id)
+{
+ struct intel_uncore_type *type;
+ struct pci_dev *dev = NULL;
+ u32 device, devfn;
+ u64 *ctls;
+ int die;
+
+ if (type_id == UNCORE_SPR_UPI) {
+ type = &spr_uncore_upi;
+ device = SPR_UNCORE_UPI_PCIID;
+ devfn = SPR_UNCORE_UPI0_DEVFN;
+ } else if (type_id == UNCORE_SPR_M3UPI) {
+ type = &spr_uncore_m3upi;
+ device = SPR_UNCORE_M3UPI_PCIID;
+ devfn = SPR_UNCORE_M3UPI0_DEVFN;
+ } else
+ return;
+
+ ctls = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
+ if (!ctls) {
+ type->num_boxes = 0;
+ return;
+ }
+
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+ if (devfn != dev->devfn)
+ continue;
+
+ die = uncore_device_to_die(dev);
+ if (die < 0)
+ continue;
+
+ ctls[die] = pci_domain_nr(dev->bus) << UNCORE_DISCOVERY_PCI_DOMAIN_OFFSET |
+ dev->bus->number << UNCORE_DISCOVERY_PCI_BUS_OFFSET |
+ devfn << UNCORE_DISCOVERY_PCI_DEVFN_OFFSET |
+ type->box_ctl;
+ }
+
+ type->box_ctls = ctls;
+}
+
int spr_uncore_pci_init(void)
{
- uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL);
+ /*
+ * The discovery table of UPI on some SPR variant is broken,
+ * which impacts the detection of both UPI and M3UPI uncore PMON.
+ * Use the pre-defined UPI and M3UPI table to replace.
+ *
+ * The accurate location, e.g., domain and BUS number,
+ * can only be retrieved at load time.
+ * Update the location of UPI and M3UPI.
+ */
+ spr_update_device_location(UNCORE_SPR_UPI);
+ spr_update_device_location(UNCORE_SPR_M3UPI);
+ uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI,
+ UNCORE_SPR_PCI_EXTRA_UNCORES,
+ spr_pci_uncores);
return 0;
}
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index ecced3a52668..0feaaa571303 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -69,6 +69,9 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_BROADWELL_G:
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ case INTEL_FAM6_EMERALDRAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_D:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_D:
@@ -107,6 +110,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
+ case INTEL_FAM6_METEORLAKE:
+ case INTEL_FAM6_METEORLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 332d2e6d8ae4..d6de4487348c 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -35,15 +35,17 @@
* per-core reg tables.
*/
enum extra_reg_type {
- EXTRA_REG_NONE = -1, /* not used */
+ EXTRA_REG_NONE = -1, /* not used */
- EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
- EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
- EXTRA_REG_LBR = 2, /* lbr_select */
- EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
- EXTRA_REG_FE = 4, /* fe_* */
+ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
+ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
+ EXTRA_REG_LBR = 2, /* lbr_select */
+ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
+ EXTRA_REG_SNOOP_0 = 5, /* snoop response 0 */
+ EXTRA_REG_SNOOP_1 = 6, /* snoop response 1 */
- EXTRA_REG_MAX /* number of entries needed */
+ EXTRA_REG_MAX /* number of entries needed */
};
struct event_constraint {
@@ -606,6 +608,7 @@ union perf_capabilities {
u64 pebs_baseline:1;
u64 perf_metrics:1;
u64 pebs_output_pt_available:1;
+ u64 pebs_timing_info:1;
u64 anythread_deprecated:1;
};
u64 capabilities;
@@ -647,6 +650,7 @@ enum {
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x10
+#define PERF_PEBS_DATA_SOURCE_MASK (PERF_PEBS_DATA_SOURCE_MAX - 1)
struct x86_hybrid_pmu {
struct pmu pmu;
@@ -811,7 +815,7 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*sched_task)(struct perf_event_context *ctx,
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx,
bool sched_in);
/*
@@ -894,12 +898,12 @@ struct x86_pmu {
int num_topdown_events;
/*
- * perf task context (i.e. struct perf_event_context::task_ctx_data)
+ * perf task context (i.e. struct perf_event_pmu_context::task_ctx_data)
* switch helper to bridge calls from perf/core to perf/x86.
* See struct pmu::swap_task_ctx() usage for examples;
*/
- void (*swap_task_ctx)(struct perf_event_context *prev,
- struct perf_event_context *next);
+ void (*swap_task_ctx)(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
/*
* AMD bits
@@ -925,7 +929,7 @@ struct x86_pmu {
int (*aux_output_match) (struct perf_event *event);
- int (*filter_match)(struct perf_event *event);
+ void (*filter)(struct pmu *pmu, int cpu, bool *ret);
/*
* Hybrid support
*
@@ -1000,6 +1004,7 @@ do { \
#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */
#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */
+#define PMU_FL_RETIRE_LATENCY 0x200 /* Support Retire Latency in PEBS */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -1180,8 +1185,6 @@ int x86_pmu_handle_irq(struct pt_regs *regs);
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
u64 intel_ctrl);
-void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
-
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@@ -1306,7 +1309,7 @@ void amd_pmu_lbr_reset(void);
void amd_pmu_lbr_read(void);
void amd_pmu_lbr_add(struct perf_event *event);
void amd_pmu_lbr_del(struct perf_event *event);
-void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event);
@@ -1322,7 +1325,6 @@ void amd_brs_enable_all(void);
void amd_brs_disable_all(void);
void amd_brs_drain(void);
void amd_brs_lopwr_init(void);
-void amd_brs_disable_all(void);
int amd_brs_hw_config(struct perf_event *event);
void amd_brs_reset(void);
@@ -1330,7 +1332,7 @@ static inline void amd_pmu_brs_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- perf_sched_cb_inc(event->ctx->pmu);
+ perf_sched_cb_inc(event->pmu);
cpuc->lbr_users++;
/*
* No need to reset BRS because it is reset
@@ -1345,10 +1347,10 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
- perf_sched_cb_dec(event->ctx->pmu);
+ perf_sched_cb_dec(event->pmu);
}
-void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
#else
static inline int amd_brs_init(void)
{
@@ -1373,7 +1375,7 @@ static inline void amd_pmu_brs_del(struct perf_event *event)
{
}
-static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
+static inline void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
}
@@ -1489,6 +1491,8 @@ int intel_pmu_drain_bts_buffer(void);
u64 adl_latency_data_small(struct perf_event *event, u64 status);
+u64 mtl_latency_data_small(struct perf_event *event, u64 status);
+
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
@@ -1533,7 +1537,7 @@ void intel_pmu_pebs_enable_all(void);
void intel_pmu_pebs_disable_all(void);
-void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
void intel_pmu_auto_reload_read(struct perf_event *event);
@@ -1541,10 +1545,10 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
void intel_ds_init(void);
-void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
- struct perf_event_context *next);
+void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
+ struct perf_event_pmu_context *next_epc);
-void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
u64 lbr_from_signext_quirk_wr(u64 val);
@@ -1600,6 +1604,8 @@ void intel_pmu_pebs_data_source_adl(void);
void intel_pmu_pebs_data_source_grt(void);
+void intel_pmu_pebs_data_source_mtl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index a829492bca4c..52e6e7ed4f78 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -800,13 +800,18 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &model_spr),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
{},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index 949d845c922b..3e9acdaeed1e 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -541,7 +541,13 @@ __init int zhaoxin_pmu_init(void)
switch (boot_cpu_data.x86) {
case 0x06:
- if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) {
+ /*
+ * Support Zhaoxin CPU from ZXC series, exclude Nano series through FMS.
+ * Nano FMS: Family=6, Model=F, Stepping=[0-A][C-D]
+ * ZXC FMS: Family=6, Model=F, Stepping=E-F OR Family=6, Model=0x19, Stepping=0-3
+ */
+ if ((boot_cpu_data.x86_model == 0x0f && boot_cpu_data.x86_stepping >= 0x0e) ||
+ boot_cpu_data.x86_model == 0x19) {
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 5d2de10809ae..3a1548054b48 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o
+obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..1fbda2f94184 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -96,6 +96,11 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
wrmsr(HV_X64_MSR_EOI, val, 0);
}
+static bool cpu_is_self(int cpu)
+{
+ return cpu == smp_processor_id();
+}
+
/*
* IPI implementation on Hyper-V.
*/
@@ -128,10 +133,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
*/
if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- if (exclude_self)
- nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
- else
- nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+
+ nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask,
+ exclude_self ? cpu_is_self : NULL);
/*
* 'nr_bank <= 0' means some CPUs in cpumask can't be
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index f49bc3ec76e6..6c04b52f139b 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -29,7 +29,6 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
-#include <linux/swiotlb.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -64,7 +63,10 @@ static int hyperv_init_ghcb(void)
* memory boundary and map it here.
*/
rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
- ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB);
+
+ /* Mask out vTOM bit. ioremap_cache() maps decrypted */
+ ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary;
+ ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE);
if (!ghcb_va)
return -ENOMEM;
@@ -77,7 +79,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
+ struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
int ret;
ret = hv_common_cpu_init(cpu);
@@ -87,34 +89,32 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
- if (!*hvp) {
- if (hv_root_partition) {
- /*
- * For root partition we get the hypervisor provided VP assist
- * page, instead of allocating a new page.
- */
- rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- *hvp = memremap(msr.pfn <<
- HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
- PAGE_SIZE, MEMREMAP_WB);
- } else {
- /*
- * The VP assist page is an "overlay" page (see Hyper-V TLFS's
- * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
- * out to make sure we always write the EOI MSR in
- * hv_apic_eoi_write() *after* the EOI optimization is disabled
- * in hv_cpu_die(), otherwise a CPU may not be stopped in the
- * case of CPU offlining and the VM will hang.
- */
+ if (hv_root_partition) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ if (!*hvp)
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
- if (*hvp)
- msr.pfn = vmalloc_to_pfn(*hvp);
- }
- WARN_ON(!(*hvp));
- if (*hvp) {
- msr.enable = 1;
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
- }
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+
+ }
+ if (!WARN_ON(!(*hvp))) {
+ msr.enable = 1;
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
return hyperv_init_ghcb();
@@ -220,7 +220,7 @@ static int hv_cpu_die(unsigned int cpu)
if (hv_ghcb_pg) {
ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
if (*ghcb_va)
- memunmap(*ghcb_va);
+ iounmap(*ghcb_va);
*ghcb_va = NULL;
}
@@ -416,7 +416,7 @@ void __init hyperv_init(void)
goto free_vp_assist_page;
}
- cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
+ cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online",
hv_cpu_init, hv_cpu_die);
if (cpuhp < 0)
goto free_ghcb_page;
@@ -464,6 +464,8 @@ void __init hyperv_init(void)
BUG_ON(!src);
memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE);
memunmap(src);
+
+ hv_remap_tsc_clocksource();
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -504,16 +506,6 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
-#ifdef CONFIG_SWIOTLB
- /*
- * Swiotlb bounce buffer needs to be mapped in extra address
- * space. Map function doesn't work in the early place and so
- * call swiotlb_update_mem_attributes() here.
- */
- if (hv_is_isolation_supported())
- swiotlb_update_mem_attributes();
-#endif
-
return;
clean_guest_os_id:
@@ -537,8 +529,6 @@ void hyperv_cleanup(void)
union hv_x64_msr_hypercall_contents hypercall_msr;
union hv_reference_tsc_msr tsc_msr;
- unregister_syscore_ops(&hv_syscore_ops);
-
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
new file mode 100644
index 000000000000..85d38b9f3586
--- /dev/null
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ * Saurabh Sengar <ssengar@microsoft.com>
+ */
+
+#include <asm/apic.h>
+#include <asm/boot.h>
+#include <asm/desc.h>
+#include <asm/i8259.h>
+#include <asm/mshyperv.h>
+#include <asm/realmode.h>
+
+extern struct boot_params boot_params;
+static struct real_mode_header hv_vtl_real_mode_header;
+
+void __init hv_vtl_init_platform(void)
+{
+ pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
+
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = x86_init_noop;
+
+ x86_platform.get_wallclock = get_rtc_noop;
+ x86_platform.set_wallclock = set_rtc_noop;
+ x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.reserve_bios_regions = 0;
+ x86_platform.legacy.devices.pnpbios = 0;
+}
+
+static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
+{
+ return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
+ (desc->base1 << 16) | desc->base0;
+}
+
+static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
+{
+ return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
+}
+
+typedef void (*secondary_startup_64_fn)(void*, void*);
+static void hv_vtl_ap_entry(void)
+{
+ ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
+}
+
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
+{
+ u64 status;
+ int ret = 0;
+ struct hv_enable_vp_vtl *input;
+ unsigned long irq_flags;
+
+ struct desc_ptr gdt_ptr;
+ struct desc_ptr idt_ptr;
+
+ struct ldttss_desc *tss;
+ struct ldttss_desc *ldt;
+ struct desc_struct *gdt;
+
+ u64 rsp = current->thread.sp;
+ u64 rip = (u64)&hv_vtl_ap_entry;
+
+ native_store_gdt(&gdt_ptr);
+ store_idt(&idt_ptr);
+
+ gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
+ tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+ ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->vp_index = target_vp_index;
+ input->target_vtl.target_vtl = HV_VTL_MGMT;
+
+ /*
+ * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
+ * mode transition sequence after waking up an AP with SIPI whose
+ * vector points to the 16-bit AP startup trampoline code. Here in
+ * VTL2, we can't perform that sequence as the AP has to start in
+ * the 64-bit mode.
+ *
+ * To make this happen, we tell the hypervisor to load a valid 64-bit
+ * context (most of which is just magic numbers from the CPU manual)
+ * so that AP jumps right to the 64-bit entry of the kernel, and the
+ * control registers are loaded with values that let the AP fetch the
+ * code and data and carry on with work it gets assigned.
+ */
+
+ input->vp_context.rip = rip;
+ input->vp_context.rsp = rsp;
+ input->vp_context.rflags = 0x0000000000000002;
+ input->vp_context.efer = __rdmsr(MSR_EFER);
+ input->vp_context.cr0 = native_read_cr0();
+ input->vp_context.cr3 = __native_read_cr3();
+ input->vp_context.cr4 = native_read_cr4();
+ input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
+ input->vp_context.idtr.limit = idt_ptr.size;
+ input->vp_context.idtr.base = idt_ptr.address;
+ input->vp_context.gdtr.limit = gdt_ptr.size;
+ input->vp_context.gdtr.base = gdt_ptr.address;
+
+ /* Non-system desc (64bit), long, code, present */
+ input->vp_context.cs.selector = __KERNEL_CS;
+ input->vp_context.cs.base = 0;
+ input->vp_context.cs.limit = 0xffffffff;
+ input->vp_context.cs.attributes = 0xa09b;
+ /* Non-system desc (64bit), data, present, granularity, default */
+ input->vp_context.ss.selector = __KERNEL_DS;
+ input->vp_context.ss.base = 0;
+ input->vp_context.ss.limit = 0xffffffff;
+ input->vp_context.ss.attributes = 0xc093;
+
+ /* System desc (128bit), present, LDT */
+ input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
+ input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
+ input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
+ input->vp_context.ldtr.attributes = 0x82;
+
+ /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
+ input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
+ input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
+ input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
+ input->vp_context.tr.attributes = 0x8b;
+
+ status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);
+
+ if (!hv_result_success(status) &&
+ hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
+ pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]",
+ target_vp_index, status);
+ ret = -EINVAL;
+ goto free_lock;
+ }
+
+ status = hv_do_hypercall(HVCALL_START_VP, input, NULL);
+
+ if (!hv_result_success(status)) {
+ pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
+ target_vp_index, status);
+ ret = -EINVAL;
+ }
+
+free_lock:
+ local_irq_restore(irq_flags);
+
+ return ret;
+}
+
+static int hv_vtl_apicid_to_vp_id(u32 apic_id)
+{
+ u64 control;
+ u64 status;
+ unsigned long irq_flags;
+ struct hv_get_vp_from_apic_id_in *input;
+ u32 *output, ret;
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->apic_ids[0] = apic_id;
+
+ output = (u32 *)input;
+
+ control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
+ status = hv_do_hypercall(control, input, output);
+ ret = output[0];
+
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("failed to get vp id from apic id %d, status %#llx\n",
+ apic_id, status);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+ int vp_id;
+
+ pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
+ vp_id = hv_vtl_apicid_to_vp_id(apicid);
+
+ if (vp_id < 0) {
+ pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
+ return -EINVAL;
+ }
+ if (vp_id > ms_hyperv.max_vp_index) {
+ pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
+ return -EINVAL;
+ }
+
+ return hv_vtl_bringup_vcpu(vp_id, start_eip);
+}
+
+static int __init hv_vtl_early_init(void)
+{
+ /*
+ * `boot_cpu_has` returns the runtime feature support,
+ * and here is the earliest it can be used.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+ panic("XSAVE has to be disabled as it is not supported by this module.\n"
+ "Please add 'noxsave' to the kernel command line.\n");
+
+ real_mode_header = &hv_vtl_real_mode_header;
+ apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu;
+
+ return 0;
+}
+early_initcall(hv_vtl_early_init);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 1dbcbd9da74d..cc92388b7a99 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -13,6 +13,8 @@
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
+#include <asm/coco.h>
+#include <asm/mem_encrypt.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
@@ -127,7 +129,7 @@ static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code,
return ES_OK;
}
-void hv_ghcb_terminate(unsigned int set, unsigned int reason)
+void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason)
{
u64 val = GHCB_MSR_TERM_REQ;
@@ -233,41 +235,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
-#endif
-
-enum hv_isolation_type hv_get_isolation_type(void)
-{
- if (!(ms_hyperv.priv_high & HV_ISOLATION))
- return HV_ISOLATION_TYPE_NONE;
- return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
-}
-EXPORT_SYMBOL_GPL(hv_get_isolation_type);
-
-/*
- * hv_is_isolation_supported - Check system runs in the Hyper-V
- * isolation VM.
- */
-bool hv_is_isolation_supported(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
- return false;
-
- if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
- return false;
-
- return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
-}
-
-DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
-
-/*
- * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
- * isolation VM.
- */
-bool hv_isolation_type_snp(void)
-{
- return static_branch_unlikely(&isolation_type_snp);
-}
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -320,27 +287,25 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
}
/*
- * hv_set_mem_host_visibility - Set specified memory visible to host.
+ * hv_vtom_set_host_visibility - Set specified memory visible to host.
*
* In Isolation VM, all guest memory is encrypted from host and guest
* needs to set memory visible to host via hvcall before sharing memory
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
-int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible)
+static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
- enum hv_mem_host_visibility visibility = visible ?
- VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE;
+ enum hv_mem_host_visibility visibility = enc ?
+ VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
int ret = 0;
+ bool result = true;
int i, pfn;
- if (!hv_is_isolation_supported() || !hv_hypercall_pg)
- return 0;
-
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array)
- return -ENOMEM;
+ return false;
for (i = 0, pfn = 0; i < pagecount; i++) {
pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
@@ -349,41 +314,98 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
- if (ret)
+ if (ret) {
+ result = false;
goto err_free_pfn_array;
+ }
pfn = 0;
}
}
err_free_pfn_array:
kfree(pfn_array);
- return ret;
+ return result;
}
-/*
- * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
- */
-void *hv_map_memory(void *addr, unsigned long size)
+static bool hv_vtom_tlb_flush_required(bool private)
{
- unsigned long *pfns = kcalloc(size / PAGE_SIZE,
- sizeof(unsigned long), GFP_KERNEL);
- void *vaddr;
- int i;
+ return true;
+}
+
+static bool hv_vtom_cache_flush_required(void)
+{
+ return false;
+}
- if (!pfns)
- return NULL;
+static bool hv_is_private_mmio(u64 addr)
+{
+ /*
+ * Hyper-V always provides a single IO-APIC in a guest VM.
+ * When a paravisor is used, it is emulated by the paravisor
+ * in the guest context and must be mapped private.
+ */
+ if (addr >= HV_IOAPIC_BASE_ADDRESS &&
+ addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ /* Same with a vTPM */
+ if (addr >= VTPM_BASE_ADDRESS &&
+ addr < (VTPM_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ return false;
+}
+
+void __init hv_vtom_init(void)
+{
+ /*
+ * By design, a VM using vTOM doesn't see the SEV setting,
+ * so SEV initialization is bypassed and sev_status isn't set.
+ * Set it here to indicate a vTOM VM.
+ */
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_set_vendor(CC_VENDOR_AMD);
+ cc_set_mask(ms_hyperv.shared_gpa_boundary);
+ physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
+
+ x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
+ x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
+ x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+}
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
- for (i = 0; i < size / PAGE_SIZE; i++)
- pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
- (ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
+/*
+ * hv_is_isolation_supported - Check system runs in the Hyper-V
+ * isolation VM.
+ */
+bool hv_is_isolation_supported(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return false;
- vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
- kfree(pfns);
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ return false;
- return vaddr;
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
-void hv_unmap_memory(void *addr)
+DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+
+/*
+ * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * isolation VM.
+ */
+bool hv_isolation_type_snp(void)
{
- vunmap(addr);
+ return static_branch_unlikely(&isolation_type_snp);
}
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 0ad2378fe6ad..8460bd35e10c 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -52,6 +52,11 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}
+static bool cpu_is_lazy(int cpu)
+{
+ return per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+}
+
static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
const struct flush_tlb_info *info)
{
@@ -60,6 +65,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
struct hv_tlb_flush *flush;
u64 status;
unsigned long flags;
+ bool do_lazy = !info->freed_tables;
trace_hyperv_mmu_flush_tlb_multi(cpus, info);
@@ -112,6 +118,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
goto do_ex_hypercall;
for_each_cpu(cpu, cpus) {
+ if (do_lazy && cpu_is_lazy(cpu))
+ continue;
vcpu = hv_cpu_number_to_vp_number(cpu);
if (vcpu == VP_INVAL) {
local_irq_restore(flags);
@@ -198,7 +206,8 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.valid_bank_mask = 0;
flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
+ nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus,
+ info->freed_tables ? NULL : cpu_is_lazy);
if (nr_bank < 0)
return HV_STATUS_INVALID_PARAMETER;
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e481056698de..333556a86b2a 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,5 @@
# Makefile for the ia32 kernel emulation subsystem.
#
-obj-$(CONFIG_IA32_EMULATION) := ia32_signal.o
-
audit-class-$(CONFIG_AUDIT) := audit.o
obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 1e51650b79d7..4f1ce5fc4e19 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
+generated-y += orc_hash.h
generated-y += syscalls_32.h
generated-y += syscalls_64.h
generated-y += syscalls_x32.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 65064d9f7fa6..8eb74cf386db 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -14,6 +14,7 @@
#include <asm/mmu.h>
#include <asm/mpspec.h>
#include <asm/x86_init.h>
+#include <asm/cpufeature.h>
#ifdef CONFIG_ACPI_APEI
# include <asm/pgtable_types.h>
@@ -63,6 +64,13 @@ extern int (*acpi_suspend_lowlevel)(void);
/* Physical address to resume after wakeup */
unsigned long acpi_get_wakeup_address(void);
+static inline bool acpi_skip_set_wakeup_address(void)
+{
+ return cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+
+#define acpi_skip_set_wakeup_address acpi_skip_set_wakeup_address
+
/*
* Check if the CPU can handle C2 and deeper
*/
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index cd7b14322035..c8c111d8fbd7 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -23,10 +23,4 @@
*/
#define flush_agp_cache() wbinvd()
-/* GATT allocation. Returns/accepts GATT kernel virtual address. */
-#define alloc_gatt_pages(order) \
- ((char *)__get_free_pages(GFP_KERNEL, (order)))
-#define free_gatt_pages(table, order) \
- free_pages((unsigned long)(table), (order))
-
#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 9542c582d546..d7da28fada87 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -6,8 +6,10 @@
#include <linux/stringify.h>
#include <asm/asm.h>
-#define ALTINSTR_FLAG_INV (1 << 15)
-#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
+#define ALT_FLAGS_SHIFT 16
+
+#define ALT_FLAG_NOT (1 << 0)
+#define ALT_NOT(feature) ((ALT_FLAG_NOT << ALT_FLAGS_SHIFT) | (feature))
#ifndef __ASSEMBLY__
@@ -59,10 +61,27 @@
".long 999b - .\n\t" \
".popsection\n\t"
+/*
+ * The patching flags are part of the upper bits of the @ft_flags parameter when
+ * specifying them. The split is currently like this:
+ *
+ * [31... flags ...16][15... CPUID feature bit ...0]
+ *
+ * but since this is all hidden in the macros argument being split, those fields can be
+ * extended in the future to fit in a u64 or however the need arises.
+ */
struct alt_instr {
s32 instr_offset; /* original instruction */
s32 repl_offset; /* offset to replacement instruction */
- u16 cpuid; /* cpuid bit set for replacement */
+
+ union {
+ struct {
+ u32 cpuid: 16; /* CPUID bit set for replacement */
+ u32 flags: 16; /* patching control flags */
+ };
+ u32 ft_flags;
+ };
+
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction */
} __packed;
@@ -78,8 +97,43 @@ extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
extern void apply_retpolines(s32 *start, s32 *end);
extern void apply_returns(s32 *start, s32 *end);
extern void apply_ibt_endbr(s32 *start, s32 *end);
+extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine,
+ s32 *start_cfi, s32 *end_cfi);
struct module;
+struct paravirt_patch_site;
+
+struct callthunk_sites {
+ s32 *call_start, *call_end;
+ struct paravirt_patch_site *pv_start, *pv_end;
+};
+
+#ifdef CONFIG_CALL_THUNKS
+extern void callthunks_patch_builtin_calls(void);
+extern void callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod);
+extern void *callthunks_translate_call_dest(void *dest);
+extern bool is_callthunk(void *addr);
+extern int x86_call_depth_emit_accounting(u8 **pprog, void *func);
+#else
+static __always_inline void callthunks_patch_builtin_calls(void) {}
+static __always_inline void
+callthunks_patch_module_calls(struct callthunk_sites *sites,
+ struct module *mod) {}
+static __always_inline void *callthunks_translate_call_dest(void *dest)
+{
+ return dest;
+}
+static __always_inline bool is_callthunk(void *addr)
+{
+ return false;
+}
+static __always_inline int x86_call_depth_emit_accounting(u8 **pprog,
+ void *func)
+{
+ return 0;
+}
+#endif
#ifdef CONFIG_SMP
extern void alternatives_smp_module_add(struct module *mod, char *name,
@@ -147,10 +201,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
" - (" alt_slen ")), 0x90\n" \
alt_end_marker ":\n"
-#define ALTINSTR_ENTRY(feature, num) \
+#define ALTINSTR_ENTRY(ft_flags, num) \
" .long 661b - .\n" /* label */ \
" .long " b_replacement(num)"f - .\n" /* new instruction */ \
- " .word " __stringify(feature) "\n" /* feature bit */ \
+ " .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
" .byte " alt_total_slen "\n" /* source len */ \
" .byte " alt_rlen(num) "\n" /* replacement len */
@@ -159,20 +213,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
/* alternative assembly primitive: */
-#define ALTERNATIVE(oldinstr, newinstr, feature) \
+#define ALTERNATIVE(oldinstr, newinstr, ft_flags) \
OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feature, 1) \
+ ALTINSTR_ENTRY(ft_flags, 1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, 1) \
".popsection\n"
-#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
+#define ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feature1, 1) \
- ALTINSTR_ENTRY(feature2, 2) \
+ ALTINSTR_ENTRY(ft_flags1, 1) \
+ ALTINSTR_ENTRY(ft_flags2, 2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, 1) \
@@ -180,21 +234,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
".popsection\n"
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
- newinstr_yes, feature)
-
-#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
- OLDINSTR_3(oldinsn, 1, 2, 3) \
- ".pushsection .altinstructions,\"a\"\n" \
- ALTINSTR_ENTRY(feat1, 1) \
- ALTINSTR_ENTRY(feat2, 2) \
- ALTINSTR_ENTRY(feat3, 3) \
- ".popsection\n" \
- ".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, 3) \
+ newinstr_yes, ft_flags)
+
+#define ALTERNATIVE_3(oldinsn, newinsn1, ft_flags1, newinsn2, ft_flags2, \
+ newinsn3, ft_flags3) \
+ OLDINSTR_3(oldinsn, 1, 2, 3) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(ft_flags1, 1) \
+ ALTINSTR_ENTRY(ft_flags2, 2) \
+ ALTINSTR_ENTRY(ft_flags3, 3) \
+ ".popsection\n" \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(newinsn1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, 3) \
".popsection\n"
/*
@@ -209,14 +264,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
* For non barrier like inlines please define new variants
* without volatile and memory clobber.
*/
-#define alternative(oldinstr, newinstr, feature) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative(oldinstr, newinstr, ft_flags) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) : : : "memory")
-#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2) ::: "memory")
-#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
- asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
+#define alternative_ternary(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
+ asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) ::: "memory")
/*
* Alternative inline assembly with input.
@@ -226,8 +281,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Argument numbers start with 1.
* Leaving an unused argument 0 to keep API compatibility.
*/
-#define alternative_input(oldinstr, newinstr, feature, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_input(oldinstr, newinstr, ft_flags, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: : "i" (0), ## input)
/*
@@ -238,20 +293,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, newinstr1 is used.
* Otherwise, oldinstr is used.
*/
-#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
- feature2, input...) \
- asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
- newinstr2, feature2) \
+#define alternative_input_2(oldinstr, newinstr1, ft_flags1, newinstr2, \
+ ft_flags2, input...) \
+ asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, ft_flags1, \
+ newinstr2, ft_flags2) \
: : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
-#define alternative_io(oldinstr, newinstr, feature, output, input...) \
- asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+#define alternative_io(oldinstr, newinstr, ft_flags, output, input...) \
+ asm_inline volatile (ALTERNATIVE(oldinstr, newinstr, ft_flags) \
: output : "i" (0), ## input)
/* Like alternative_io, but for replacing a direct call with another one. */
-#define alternative_call(oldfunc, newfunc, feature, output, input...) \
- asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
+#define alternative_call(oldfunc, newfunc, ft_flags, output, input...) \
+ asm_inline volatile (ALTERNATIVE("call %P[old]", "call %P[new]", ft_flags) \
: output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
/*
@@ -260,10 +315,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
* Otherwise, if CPU has feature1, function1 is used.
* Otherwise, old function is used.
*/
-#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
+#define alternative_call_2(oldfunc, newfunc1, ft_flags1, newfunc2, ft_flags2, \
output, input...) \
- asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
- "call %P[new2]", feature2) \
+ asm_inline volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", ft_flags1,\
+ "call %P[new2]", ft_flags2) \
: output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input)
@@ -312,10 +367,10 @@ static inline int alternatives_text_reserved(void *start, void *end)
* enough information for the alternatives patching code to patch an
* instruction. See apply_alternatives().
*/
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstr_entry orig alt ft_flags orig_len alt_len
.long \orig - .
.long \alt - .
- .word \feature
+ .4byte \ft_flags
.byte \orig_len
.byte \alt_len
.endm
@@ -326,7 +381,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
* @newinstr. ".skip" directive takes care of proper instruction padding
* in case @newinstr is longer than @oldinstr.
*/
-.macro ALTERNATIVE oldinstr, newinstr, feature
+.macro ALTERNATIVE oldinstr, newinstr, ft_flags
140:
\oldinstr
141:
@@ -334,7 +389,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
142:
.pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
+ altinstr_entry 140b,143f,\ft_flags,142b-140b,144f-143f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -347,6 +402,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define old_len 141b-140b
#define new_len1 144f-143f
#define new_len2 145f-144f
+#define new_len3 146f-145f
/*
* gas compatible max based on the idea from:
@@ -354,7 +410,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
*
* The additional "-" is needed because gas uses a "true" value of -1.
*/
-#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+#define alt_max_2(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+#define alt_max_3(a, b, c) (alt_max_2(alt_max_2(a, b), c))
/*
@@ -362,17 +419,40 @@ static inline int alternatives_text_reserved(void *start, void *end)
* has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
* @feature2, it replaces @oldinstr with @feature2.
*/
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+.macro ALTERNATIVE_2 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_2(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_2(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+ altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+.macro ALTERNATIVE_3 oldinstr, newinstr1, ft_flags1, newinstr2, ft_flags2, newinstr3, ft_flags3
140:
\oldinstr
141:
- .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+ .skip -((alt_max_3(new_len1, new_len2, new_len3) - (old_len)) > 0) * \
+ (alt_max_3(new_len1, new_len2, new_len3) - (old_len)),0x90
142:
.pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ altinstr_entry 140b,143f,\ft_flags1,142b-140b,144f-143f
+ altinstr_entry 140b,144f,\ft_flags2,142b-140b,145f-144f
+ altinstr_entry 140b,145f,\ft_flags3,142b-140b,146f-145f
.popsection
.pushsection .altinstr_replacement,"ax"
@@ -381,13 +461,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
144:
\newinstr2
145:
+ \newinstr3
+146:
.popsection
.endm
/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
-#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+#define ALTERNATIVE_TERNARY(oldinstr, ft_flags, newinstr_yes, newinstr_no) \
ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
- newinstr_yes, feature
+ newinstr_yes, ft_flags
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3415321c8240..3216da7074ba 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -249,7 +249,6 @@ static inline u64 native_x2apic_icr_read(void)
extern int x2apic_mode;
extern int x2apic_phys;
extern void __init x2apic_set_max_apicid(u32 apicid);
-extern void __init check_x2apic(void);
extern void x2apic_setup(void);
static inline int x2apic_enabled(void)
{
@@ -258,13 +257,13 @@ static inline int x2apic_enabled(void)
#define x2apic_supported() (boot_cpu_has(X86_FEATURE_X2APIC))
#else /* !CONFIG_X86_X2APIC */
-static inline void check_x2apic(void) { }
static inline void x2apic_setup(void) { }
static inline int x2apic_enabled(void) { return 0; }
#define x2apic_mode (0)
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
+extern void __init check_x2apic(void);
struct irq_data;
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 8f80de627c60..b1a98fa38828 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -12,6 +12,7 @@
#include <asm/special_insns.h>
#include <asm/preempt.h>
#include <asm/asm.h>
+#include <asm/gsseg.h>
#ifndef CONFIG_X86_CMPXCHG64
extern void cmpxchg8b_emu(void);
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 5efd01b548d1..808b4eece251 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -71,7 +71,7 @@ ATOMIC64_DECL(add_unless);
* the old value.
*/
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
{
return arch_cmpxchg64(&v->counter, o, n);
}
@@ -85,7 +85,7 @@ static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
* Atomically xchgs the value of @v to @n and returns
* the old value.
*/
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
unsigned high = (unsigned)(n >> 32);
@@ -104,7 +104,7 @@ static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
*
* Atomically sets the value of @v to @n.
*/
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
unsigned high = (unsigned)(i >> 32);
unsigned low = (unsigned)i;
@@ -119,7 +119,7 @@ static inline void arch_atomic64_set(atomic64_t *v, s64 i)
*
* Atomically reads the value of @v and returns it.
*/
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
s64 r;
alternative_atomic64(read, "=&A" (r), "c" (v) : "memory");
@@ -133,7 +133,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
*
* Atomically adds @i to @v and returns @i + *@v
*/
-static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
{
alternative_atomic64(add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -145,7 +145,7 @@ static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
/*
* Other variants with different arithmetic operators:
*/
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
alternative_atomic64(sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -154,7 +154,7 @@ static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
-static inline s64 arch_atomic64_inc_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_inc_return(atomic64_t *v)
{
s64 a;
alternative_atomic64(inc_return, "=&A" (a),
@@ -163,7 +163,7 @@ static inline s64 arch_atomic64_inc_return(atomic64_t *v)
}
#define arch_atomic64_inc_return arch_atomic64_inc_return
-static inline s64 arch_atomic64_dec_return(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
{
s64 a;
alternative_atomic64(dec_return, "=&A" (a),
@@ -179,7 +179,7 @@ static inline s64 arch_atomic64_dec_return(atomic64_t *v)
*
* Atomically adds @i to @v.
*/
-static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
{
__alternative_atomic64(add, add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -194,7 +194,7 @@ static inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
{
__alternative_atomic64(sub, sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
@@ -208,7 +208,7 @@ static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
*
* Atomically increments @v by 1.
*/
-static inline void arch_atomic64_inc(atomic64_t *v)
+static __always_inline void arch_atomic64_inc(atomic64_t *v)
{
__alternative_atomic64(inc, inc_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
@@ -221,7 +221,7 @@ static inline void arch_atomic64_inc(atomic64_t *v)
*
* Atomically decrements @v by 1.
*/
-static inline void arch_atomic64_dec(atomic64_t *v)
+static __always_inline void arch_atomic64_dec(atomic64_t *v)
{
__alternative_atomic64(dec, dec_return, /* no output */,
"S" (v) : "memory", "eax", "ecx", "edx");
@@ -237,7 +237,7 @@ static inline void arch_atomic64_dec(atomic64_t *v)
* Atomically adds @a to @v, so long as it was not @u.
* Returns non-zero if the add was done, zero otherwise.
*/
-static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
+static __always_inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
unsigned low = (unsigned)u;
unsigned high = (unsigned)(u >> 32);
@@ -248,7 +248,7 @@ static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
}
#define arch_atomic64_add_unless arch_atomic64_add_unless
-static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
+static __always_inline int arch_atomic64_inc_not_zero(atomic64_t *v)
{
int r;
alternative_atomic64(inc_not_zero, "=&a" (r),
@@ -257,7 +257,7 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v)
}
#define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero
-static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
+static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
{
s64 r;
alternative_atomic64(dec_if_positive, "=&A" (r),
@@ -269,7 +269,7 @@ static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -277,7 +277,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -288,7 +288,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -296,7 +296,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -307,7 +307,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -315,7 +315,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
c = old;
}
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
s64 old, c = 0;
@@ -326,7 +326,7 @@ static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
s64 old, c = 0;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 7886d0578fc9..c496595bf601 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -17,7 +17,7 @@
* Atomically reads the value of @v.
* Doesn't imply a read memory barrier.
*/
-static inline s64 arch_atomic64_read(const atomic64_t *v)
+static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
{
return __READ_ONCE((v)->counter);
}
@@ -29,7 +29,7 @@ static inline s64 arch_atomic64_read(const atomic64_t *v)
*
* Atomically sets the value of @v to @i.
*/
-static inline void arch_atomic64_set(atomic64_t *v, s64 i)
+static __always_inline void arch_atomic64_set(atomic64_t *v, s64 i)
{
__WRITE_ONCE(v->counter, i);
}
@@ -55,7 +55,7 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
*
* Atomically subtracts @i from @v.
*/
-static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "subq %1,%0"
: "=m" (v->counter)
@@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(s64 i, atomic64_t *v)
* true if the result is zero, or false for all
* other cases.
*/
-static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
}
@@ -113,7 +113,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
* returns true if the result is 0, or false for all other
* cases.
*/
-static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_dec_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
}
@@ -127,7 +127,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
* and returns true if the result is zero, or false for all
* other cases.
*/
-static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
+static __always_inline bool arch_atomic64_inc_and_test(atomic64_t *v)
{
return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
}
@@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
* if the result is negative, or false when
* result is greater than or equal to zero.
*/
-static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
+static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v)
{
return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i);
}
@@ -161,25 +161,25 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return
-static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
{
return arch_atomic64_add_return(-i, v);
}
#define arch_atomic64_sub_return arch_atomic64_sub_return
-static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
return xadd(&v->counter, i);
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add
-static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
return xadd(&v->counter, -i);
}
#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
-static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
return arch_cmpxchg(&v->counter, old, new);
}
@@ -191,13 +191,13 @@ static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s
}
#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
-static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
{
return arch_xchg(&v->counter, new);
}
#define arch_atomic64_xchg arch_atomic64_xchg
-static inline void arch_atomic64_and(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "andq %1,%0"
: "+m" (v->counter)
@@ -205,7 +205,7 @@ static inline void arch_atomic64_and(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
@@ -215,7 +215,7 @@ static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and
-static inline void arch_atomic64_or(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "orq %1,%0"
: "+m" (v->counter)
@@ -223,7 +223,7 @@ static inline void arch_atomic64_or(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
@@ -233,7 +233,7 @@ static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or
-static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
asm volatile(LOCK_PREFIX "xorq %1,%0"
: "+m" (v->counter)
@@ -241,7 +241,7 @@ static inline void arch_atomic64_xor(s64 i, atomic64_t *v)
: "memory");
}
-static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
+static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
s64 val = arch_atomic64_read(v);
diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h
index 53e9b0620d96..d90ae472fb76 100644
--- a/arch/x86/include/asm/bootparam_utils.h
+++ b/arch/x86/include/asm/bootparam_utils.h
@@ -38,7 +38,7 @@ static void sanitize_boot_params(struct boot_params *boot_params)
* IMPORTANT NOTE TO BOOTLOADER AUTHORS: do not simply clear
* this field. The purpose of this field is to guarantee
* compliance with the x86 boot spec located in
- * Documentation/x86/boot.rst . That spec says that the
+ * Documentation/arch/x86/boot.rst . That spec says that the
* *whole* structure should be cleared, after which only the
* portion defined by struct setup_header (boot_params->hdr)
* should be copied in.
diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h
index 86b2e0dcc4bf..ce9685fc78d8 100644
--- a/arch/x86/include/asm/cacheinfo.h
+++ b/arch/x86/include/asm/cacheinfo.h
@@ -2,7 +2,20 @@
#ifndef _ASM_X86_CACHEINFO_H
#define _ASM_X86_CACHEINFO_H
+/* Kernel controls MTRR and/or PAT MSRs. */
+extern unsigned int memory_caching_control;
+#define CACHE_MTRR 0x01
+#define CACHE_PAT 0x02
+
void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu);
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu);
+void cache_disable(void);
+void cache_enable(void);
+void set_cache_aps_delayed_init(bool val);
+bool get_cache_aps_delayed_init(void);
+void cache_bp_init(void);
+void cache_bp_restore(void);
+void cache_aps_init(void);
+
#endif /* _ASM_X86_CACHEINFO_H */
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 407beebadaf4..4d4a47a3a8ab 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -9,7 +9,6 @@
*/
#include <linux/compiler.h>
-#include <linux/uaccess.h>
#include <asm/byteorder.h>
/**
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 94fbe6ae7431..540573f515b7 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -221,9 +221,15 @@ extern void __add_wrong_size(void)
#define __try_cmpxchg(ptr, pold, new, size) \
__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+#define __try_cmpxchg_local(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), "")
+
#define arch_try_cmpxchg(ptr, pold, new) \
__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+#define arch_try_cmpxchg_local(ptr, pold, new) \
+ __try_cmpxchg_local((ptr), (pold), (new), sizeof(*(ptr)))
+
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
* value of "*ptr".
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 215f5a65790f..6ba80ce9438d 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -7,34 +7,6 @@
* you need to test for the feature in boot_cpu_data.
*/
-/*
- * CMPXCHG8B only writes to the target if we had the previous
- * value in registers, otherwise it acts as a read and gives us the
- * "new previous" value. That is why there is a loop. Preloading
- * EDX:EAX is a performance optimization: in the common case it means
- * we need only one locked operation.
- *
- * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
- * least an FPU save and/or %cr0.ts manipulation.
- *
- * cmpxchg8b must be used with the lock prefix here to allow the
- * instruction to be executed atomically. We need to have the reader
- * side to see the coherent 64bit value.
- */
-static inline void set_64bit(volatile u64 *ptr, u64 value)
-{
- u32 low = value;
- u32 high = value >> 32;
- u64 prev = *ptr;
-
- asm volatile("\n1:\t"
- LOCK_PREFIX "cmpxchg8b %0\n\t"
- "jnz 1b"
- : "=m" (*ptr), "+A" (prev)
- : "b" (low), "c" (high)
- : "memory");
-}
-
#ifdef CONFIG_X86_CMPXCHG64
#define arch_cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 250187ac8248..0d3beb27b7fe 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -2,11 +2,6 @@
#ifndef _ASM_X86_CMPXCHG_64_H
#define _ASM_X86_CMPXCHG_64_H
-static inline void set_64bit(volatile u64 *ptr, u64 val)
-{
- *ptr = val;
-}
-
#define arch_cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 3d98c3a60d34..eb08796002f3 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -7,17 +7,33 @@
enum cc_vendor {
CC_VENDOR_NONE,
CC_VENDOR_AMD,
- CC_VENDOR_HYPERV,
CC_VENDOR_INTEL,
};
-void cc_set_vendor(enum cc_vendor v);
-void cc_set_mask(u64 mask);
-
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+extern enum cc_vendor cc_vendor;
+
+static inline enum cc_vendor cc_get_vendor(void)
+{
+ return cc_vendor;
+}
+
+static inline void cc_set_vendor(enum cc_vendor vendor)
+{
+ cc_vendor = vendor;
+}
+
+void cc_set_mask(u64 mask);
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
#else
+static inline enum cc_vendor cc_get_vendor(void)
+{
+ return CC_VENDOR_NONE;
+}
+
+static inline void cc_set_vendor(enum cc_vendor vendor) { }
+
static inline u64 cc_mkenc(u64 val)
{
return val;
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b472ef76826a..78796b98a544 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -95,5 +95,7 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
}
extern u64 x86_read_arch_cap_msr(void);
+int intel_find_matching_signature(void *mc, unsigned int csig, int cpf);
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 75efc4c6f076..462fc34f1317 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -130,10 +130,6 @@ struct cpu_entry_area {
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
-#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
-
-/* Total size includes the readonly IDT mapping page as well: */
-#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1a85e1fb0922..ce0c8f7d3218 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -32,6 +32,7 @@ enum cpuid_leafs
CPUID_8000_0007_EBX,
CPUID_7_EDX,
CPUID_8000_001F_EAX,
+ CPUID_8000_0021_EAX,
};
#define X86_CAP_FMT_NUM "%d:%d"
@@ -94,8 +95,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \
REQUIRED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 20))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 21))
#define DISABLED_MASK_BIT_SET(feature_bit) \
( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
@@ -118,8 +120,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \
+ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \
DISABLED_MASK_CHECK || \
- BUILD_BUG_ON_ZERO(NCAPINTS != 20))
+ BUILD_BUG_ON_ZERO(NCAPINTS != 21))
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b71f4f2ecdd5..cb8ca46213be 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,7 +13,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 20 /* N 32-bit words worth of info */
+#define NCAPINTS 21 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -97,7 +97,7 @@
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+/* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
@@ -226,10 +226,9 @@
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
@@ -304,10 +303,24 @@
#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */
#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
+#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
+#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
+#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
+#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
+#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
+#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
+#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
+#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
+#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
+#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
@@ -324,6 +337,7 @@
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
+#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
@@ -356,6 +370,7 @@
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
+#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -420,6 +435,13 @@
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
+/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
+#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
+#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
+#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
+#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
+#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
+
/*
* BUG word(s)
*/
@@ -460,5 +482,6 @@
#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
+#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index 70b2db18165e..9bee3e7bf973 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -1,13 +1,132 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* CPUID-related helpers/definitions
- *
- * Derived from arch/x86/kvm/cpuid.c
*/
#ifndef _ASM_X86_CPUID_H
#define _ASM_X86_CPUID_H
+#include <asm/string.h>
+
+struct cpuid_regs {
+ u32 eax, ebx, ecx, edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
+#ifdef CONFIG_X86_32
+extern int have_cpuid_p(void);
+#else
+static inline int have_cpuid_p(void)
+{
+ return 1;
+}
+#endif
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+#define native_cpuid_reg(reg) \
+static inline unsigned int native_cpuid_##reg(unsigned int op) \
+{ \
+ unsigned int eax = op, ebx, ecx = 0, edx; \
+ \
+ native_cpuid(&eax, &ebx, &ecx, &edx); \
+ \
+ return reg; \
+}
+
+/*
+ * Native CPUID functions returning a single datum.
+ */
+native_cpuid_reg(eax)
+native_cpuid_reg(ebx)
+native_cpuid_reg(ecx)
+native_cpuid_reg(edx)
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+#define __cpuid native_cpuid
+#endif
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return eax;
+}
+
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ebx;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+
+ return edx;
+}
+
static __always_inline bool cpuid_function_is_indexed(u32 function)
{
switch (function) {
@@ -31,4 +150,22 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
return false;
}
+#define for_each_possible_hypervisor_cpuid_base(function) \
+ for (function = 0x40000000; function < 0x40010000; function += 0x100)
+
+static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
+{
+ uint32_t base, eax, signature[3];
+
+ for_each_possible_hypervisor_cpuid_base(base) {
+ cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
+
+ if (!memcmp(sig, signature, 12) &&
+ (leaves == 0 || ((eax - base) >= leaves)))
+ return base;
+ }
+
+ return 0;
+}
+
#endif /* _ASM_X86_CPUID_H */
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..a1168e7b69e5 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -3,16 +3,42 @@
#define _ASM_X86_CURRENT_H
#include <linux/compiler.h>
-#include <asm/percpu.h>
#ifndef __ASSEMBLY__
+
+#include <linux/cache.h>
+#include <asm/percpu.h>
+
struct task_struct;
-DECLARE_PER_CPU(struct task_struct *, current_task);
+struct pcpu_hot {
+ union {
+ struct {
+ struct task_struct *current_task;
+ int preempt_count;
+ int cpu_number;
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ u64 call_depth;
+#endif
+ unsigned long top_of_stack;
+ void *hardirq_stack_ptr;
+ u16 softirq_pending;
+#ifdef CONFIG_X86_64
+ bool hardirq_stack_inuse;
+#else
+ void *softirq_stack_ptr;
+#endif
+ };
+ u8 pad[64];
+ };
+};
+static_assert(sizeof(struct pcpu_hot) == 64);
+
+DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
static __always_inline struct task_struct *get_current(void)
{
- return this_cpu_read_stable(current_task);
+ return this_cpu_read_stable(pcpu_hot.current_task);
}
#define current get_current()
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index cfdf307ddc01..66eb5e1ac4fb 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -2,8 +2,8 @@
#ifndef _ASM_X86_DEBUGREG_H
#define _ASM_X86_DEBUGREG_H
-
#include <linux/bug.h>
+#include <linux/percpu.h>
#include <uapi/asm/debugreg.h>
DECLARE_PER_CPU(unsigned long, cpu_dr7);
@@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno)
asm("mov %%db6, %0" :"=r" (val));
break;
case 7:
- asm("mov %%db7, %0" :"=r" (val));
+ /*
+ * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them
+ * with other code.
+ *
+ * This is needed because a DR7 access can cause a #VC exception
+ * when running under SEV-ES. Taking a #VC exception is not a
+ * safe thing to do just anywhere in the entry code and
+ * re-ordering might place the access into an unsafe location.
+ *
+ * This happened in the NMI handler, where the DR7 read was
+ * re-ordered to happen before the call to sev_es_ist_enter(),
+ * causing stack recursion.
+ */
+ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER);
break;
default:
BUG();
@@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value)
asm("mov %0, %%db6" ::"r" (value));
break;
case 7:
- asm("mov %0, %%db7" ::"r" (value));
+ /*
+ * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them
+ * with other code.
+ *
+ * While is didn't happen with a DR7 write (see the DR7 read
+ * comment above which explains where it happened), add the
+ * __FORCE_ORDER here too to avoid similar problems in the
+ * future.
+ */
+ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER);
break;
default:
BUG();
@@ -126,9 +148,14 @@ static __always_inline void local_db_restore(unsigned long dr7)
}
#ifdef CONFIG_CPU_SUP_AMD
-extern void set_dr_addr_mask(unsigned long mask, int dr);
+extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr);
+extern unsigned long amd_get_dr_addr_mask(unsigned int dr);
#else
-static inline void set_dr_addr_mask(unsigned long mask, int dr) { }
+static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { }
+static inline unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ return 0;
+}
#endif
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 33d2cd04d254..fafe9be7a6f4 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -69,6 +69,18 @@
# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31))
#endif
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+# define DISABLE_CALL_DEPTH_TRACKING 0
+#else
+# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
+#endif
+
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM 0
+#else
+# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31))
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
# define DISABLE_ENQCMD 0
#else
@@ -81,6 +93,12 @@
# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31))
#endif
+#ifdef CONFIG_XEN_PV
+# define DISABLE_XENPV 0
+#else
+# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31))
+#endif
+
#ifdef CONFIG_INTEL_TDX_GUEST
# define DISABLE_TDX_GUEST 0
#else
@@ -98,11 +116,12 @@
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
-#define DISABLED_MASK8 (DISABLE_TDX_GUEST)
+#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST)
#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
-#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET)
-#define DISABLED_MASK12 0
+#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
+ DISABLE_CALL_DEPTH_TRACKING)
+#define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
@@ -111,6 +130,7 @@
#define DISABLED_MASK17 0
#define DISABLED_MASK18 0
#define DISABLED_MASK19 0
-#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define DISABLED_MASK20 0
+#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1c66708e3062..d1dac96ee30b 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -4,7 +4,7 @@
extern const struct dma_map_ops *dma_ops;
-static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
+static inline const struct dma_map_ops *get_arch_dma_ops(void)
{
return dma_ops;
}
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 233ae6986d6f..419280d263d2 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -106,6 +106,8 @@ static inline void efi_fpu_end(void)
extern asmlinkage u64 __efi_call(void *fp, ...);
+extern bool efi_disable_ibt_for_runtime;
+
#define efi_call(...) ({ \
__efi_nargs_check(efi_call, 7, __VA_ARGS__); \
__efi_call(__VA_ARGS__); \
@@ -121,7 +123,7 @@ extern asmlinkage u64 __efi_call(void *fp, ...);
#undef arch_efi_call_virt
#define arch_efi_call_virt(p, f, args...) ({ \
- u64 ret, ibt = ibt_save(); \
+ u64 ret, ibt = ibt_save(efi_disable_ibt_for_runtime); \
ret = efi_call((void *)p->f, args); \
ibt_restore(ibt); \
ret; \
@@ -178,7 +180,7 @@ struct efi_setup_data {
extern u64 efi_setup;
#ifdef CONFIG_EFI
-extern efi_status_t __efi64_thunk(u32, ...);
+extern u64 __efi64_thunk(u32, ...);
#define efi64_thunk(...) ({ \
u64 __pad[3]; /* must have space for 3 args on the stack */ \
@@ -228,16 +230,15 @@ static inline bool efi_is_native(void)
return efi_is_64bit();
}
-#define efi_mixed_mode_cast(attr) \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(u32, __typeof__(attr)), \
- (unsigned long)(attr), (attr))
-
#define efi_table_attr(inst, attr) \
- (efi_is_native() \
- ? inst->attr \
- : (__typeof__(inst->attr)) \
- efi_mixed_mode_cast(inst->mixed_mode.attr))
+ (efi_is_native() ? (inst)->attr \
+ : efi_mixed_table_attr((inst), attr))
+
+#define efi_mixed_table_attr(inst, attr) \
+ (__typeof__(inst->attr)) \
+ _Generic(inst->mixed_mode.attr, \
+ u32: (unsigned long)(inst->mixed_mode.attr), \
+ default: (inst->mixed_mode.attr))
/*
* The following macros allow translating arguments if necessary from native to
@@ -325,6 +326,27 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi64_argmap_set_memory_space_attributes(phys, size, flags) \
(__efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+/* file protocol */
+#define __efi64_argmap_open(prot, newh, fname, mode, attr) \
+ ((prot), efi64_zero_upper(newh), (fname), __efi64_split(mode), \
+ __efi64_split(attr))
+
+#define __efi64_argmap_set_position(pos) (__efi64_split(pos))
+
+/* file system protocol */
+#define __efi64_argmap_open_volume(prot, file) \
+ ((prot), efi64_zero_upper(file))
+
+/* Memory Attribute Protocol */
+#define __efi64_argmap_get_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), (flags))
+
+#define __efi64_argmap_set_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
+#define __efi64_argmap_clear_memory_attributes(protocol, phys, size, flags) \
+ ((protocol), __efi64_split(phys), __efi64_split(size), __efi64_split(flags))
+
/*
* The macros below handle the plumbing for the argument mapping. To add a
* mapping for a specific EFI method, simply define a macro
@@ -344,31 +366,27 @@ static inline u32 efi64_convert_status(efi_status_t status)
#define __efi_eat(...)
#define __efi_eval(...) __VA_ARGS__
-/* The three macros below handle dispatching via the thunk if needed */
-
-#define efi_call_proto(inst, func, ...) \
- (efi_is_native() \
- ? inst->func(inst, ##__VA_ARGS__) \
- : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__))
+static inline efi_status_t __efi64_widen_efi_status(u64 status)
+{
+ /* use rotate to move the value of bit #31 into position #63 */
+ return ror64(rol32(status, 1), 1);
+}
-#define efi_bs_call(func, ...) \
- (efi_is_native() \
- ? efi_system_table->boottime->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_table_attr(efi_system_table, \
- boottime), \
- func, __VA_ARGS__))
+/* The macro below handles dispatching via the thunk if needed */
-#define efi_rt_call(func, ...) \
- (efi_is_native() \
- ? efi_system_table->runtime->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_table_attr(efi_system_table, \
- runtime), \
- func, __VA_ARGS__))
+#define efi_fn_call(inst, func, ...) \
+ (efi_is_native() ? (inst)->func(__VA_ARGS__) \
+ : efi_mixed_call((inst), func, ##__VA_ARGS__))
-#define efi_dxe_call(func, ...) \
- (efi_is_native() \
- ? efi_dxe_table->func(__VA_ARGS__) \
- : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__))
+#define efi_mixed_call(inst, func, ...) \
+ _Generic(inst->func(__VA_ARGS__), \
+ efi_status_t: \
+ __efi64_widen_efi_status( \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__)), \
+ u64: ({ BUILD_BUG(); ULONG_MAX; }), \
+ default: \
+ (__typeof__(inst->func(__VA_ARGS__))) \
+ __efi64_thunk_map(inst, func, ##__VA_ARGS__))
#else /* CONFIG_EFI_MIXED */
@@ -400,13 +418,52 @@ static inline void efi_reserve_boot_services(void)
#ifdef CONFIG_EFI_FAKE_MEMMAP
extern void __init efi_fake_memmap_early(void);
+extern void __init efi_fake_memmap(void);
#else
static inline void efi_fake_memmap_early(void)
{
}
+
+static inline void efi_fake_memmap(void)
+{
+}
#endif
+extern int __init efi_memmap_alloc(unsigned int num_entries,
+ struct efi_memory_map_data *data);
+extern void __efi_memmap_free(u64 phys, unsigned long size,
+ unsigned long flags);
+#define __efi_memmap_free __efi_memmap_free
+
+extern int __init efi_memmap_install(struct efi_memory_map_data *data);
+extern int __init efi_memmap_split_count(efi_memory_desc_t *md,
+ struct range *range);
+extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap,
+ void *buf, struct efi_mem_range *mem);
+
#define arch_ima_efi_boot_mode \
({ extern struct boot_params boot_params; boot_params.secure_boot; })
+#ifdef CONFIG_EFI_RUNTIME_MAP
+int efi_get_runtime_map_size(void);
+int efi_get_runtime_map_desc_size(void);
+int efi_runtime_map_copy(void *buf, size_t bufsz);
+#else
+static inline int efi_get_runtime_map_size(void)
+{
+ return 0;
+}
+
+static inline int efi_get_runtime_map_desc_size(void)
+{
+ return 0;
+}
+
+static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
+{
+ return 0;
+}
+
+#endif
+
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index cb0ff1055ab1..18fd06f7936a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -152,10 +152,6 @@ do { \
(elf_check_arch_ia32(x) || \
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
-#if __USER32_DS != __USER_DS
-# error "The following code assumes __USER32_DS == __USER_DS"
-#endif
-
static inline void elf_common_init(struct thread_struct *t,
struct pt_regs *regs, const u16 ds)
{
@@ -226,7 +222,6 @@ do { \
/* I'm not sure if we can use '-' here */
#define ELF_PLATFORM ("x86_64")
extern void set_personality_64bit(void);
-extern unsigned int sysctl_vsyscall32;
extern int force_personality32;
#endif /* !CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 674ed46d3ced..117903881fe4 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -24,8 +24,8 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
/*
* For !SMAP hardware we patch out CLAC on entry.
*/
- if (boot_cpu_has(X86_FEATURE_SMAP) ||
- (IS_ENABLED(CONFIG_64BIT) && boot_cpu_has(X86_FEATURE_XENPV)))
+ if (cpu_feature_enabled(X86_FEATURE_SMAP) ||
+ cpu_feature_enabled(X86_FEATURE_XENPV))
mask |= X86_EFLAGS_AC;
WARN_ON_ONCE(flags & mask);
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index b2486b2cbc6e..78fcde7b1f07 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -39,7 +39,7 @@ extern void fpu_flush_thread(void);
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
if (cpu_feature_enabled(X86_FEATURE_FPU) &&
- !(current->flags & PF_KTHREAD)) {
+ !(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
save_fpregs_to_fpstate(old_fpu);
/*
* The save operation preserved register state, so the
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index e1c9df9102a5..611fa41711af 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -13,16 +13,9 @@
#ifdef CONFIG_X86_64
# include <uapi/asm/sigcontext.h>
# include <asm/user32.h>
-struct ksignal;
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs);
#else
# define user_i387_ia32_struct user_i387_struct
# define user32_fxsr_struct user_fxsr_struct
-# define ia32_setup_frame __setup_frame
-# define ia32_setup_rt_frame __setup_rt_frame
#endif
extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index eb7cd1139d97..7f6d858ff47a 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -321,7 +321,7 @@ struct xstate_header {
struct xregs_state {
struct fxregs_state i387;
struct xstate_header header;
- u8 extended_state_area[0];
+ u8 extended_state_area[];
} __attribute__ ((packed, aligned (64)));
/*
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
index 9656a5bc6fea..9a710c060445 100644
--- a/arch/x86/include/asm/fpu/xcr.h
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -5,7 +5,7 @@
#define XCR_XFEATURE_ENABLED_MASK 0x00000000
#define XCR_XFEATURE_IN_USE_MASK 0x00000001
-static inline u64 xgetbv(u32 index)
+static __always_inline u64 xgetbv(u32 index)
{
u32 eax, edx;
@@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 value)
*
* Callers should check X86_FEATURE_XGETBV1.
*/
-static inline u64 xfeatures_in_use(void)
+static __always_inline u64 xfeatures_in_use(void)
{
return xgetbv(XCR_XFEATURE_IN_USE_MASK);
}
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 908d99b127d3..5061ac98ffa1 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -34,19 +34,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
return addr;
}
-/*
- * When a ftrace registered caller is tracing a function that is
- * also set by a register_ftrace_direct() call, it needs to be
- * differentiated in the ftrace_caller trampoline. To do this, we
- * place the direct caller in the ORIG_AX part of pt_regs. This
- * tells the ftrace_caller that there's a direct caller.
- */
-static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
-{
- /* Emulate a call */
- regs->orig_ax = addr;
-}
-
#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
struct ftrace_regs {
struct pt_regs regs;
@@ -61,9 +48,25 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs)
return &fregs->regs;
}
-#define ftrace_instruction_pointer_set(fregs, _ip) \
+#define ftrace_regs_set_instruction_pointer(fregs, _ip) \
do { (fregs)->regs.ip = (_ip); } while (0)
+#define ftrace_regs_get_instruction_pointer(fregs) \
+ ((fregs)->regs.ip)
+
+#define ftrace_regs_get_argument(fregs, n) \
+ regs_get_kernel_argument(&(fregs)->regs, n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+ kernel_stack_pointer(&(fregs)->regs)
+#define ftrace_regs_return_value(fregs) \
+ regs_return_value(&(fregs)->regs)
+#define ftrace_regs_set_return_value(fregs, ret) \
+ regs_set_return_value(&(fregs)->regs, ret)
+#define ftrace_override_function_with_return(fregs) \
+ override_function_with_return(&(fregs)->regs)
+#define ftrace_regs_query_register_offset(name) \
+ regs_query_register_offset(name)
+
struct ftrace_ops;
#define ftrace_graph_func ftrace_graph_func
void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
@@ -72,6 +75,24 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When a ftrace registered caller is tracing a function that is
+ * also set by a register_ftrace_direct() call, it needs to be
+ * differentiated in the ftrace_caller trampoline. To do this, we
+ * place the direct caller in the ORIG_AX part of pt_regs. This
+ * tells the ftrace_caller that there's a direct caller.
+ */
+static inline void
+__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+ /* Emulate a call */
+ regs->orig_ax = addr;
+}
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+ __arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
#ifdef CONFIG_DYNAMIC_FTRACE
struct dyn_arch_ftrace {
diff --git a/arch/x86/include/asm/gsseg.h b/arch/x86/include/asm/gsseg.h
new file mode 100644
index 000000000000..ab6a595cea70
--- /dev/null
+++ b/arch/x86/include/asm/gsseg.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_GSSEG_H
+#define _ASM_X86_GSSEG_H
+
+#include <linux/types.h>
+
+#include <asm/asm.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative.h>
+#include <asm/processor.h>
+#include <asm/nops.h>
+
+#ifdef CONFIG_X86_64
+
+extern asmlinkage void asm_load_gs_index(u16 selector);
+
+/* Replace with "lkgs %di" once binutils support LKGS instruction */
+#define LKGS_DI _ASM_BYTES(0xf2,0x0f,0x00,0xf7)
+
+static inline void native_lkgs(unsigned int selector)
+{
+ u16 sel = selector;
+ asm_inline volatile("1: " LKGS_DI
+ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k[sel])
+ : [sel] "+D" (sel));
+}
+
+static inline void native_load_gs_index(unsigned int selector)
+{
+ if (cpu_feature_enabled(X86_FEATURE_LKGS)) {
+ native_lkgs(selector);
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ asm_load_gs_index(selector);
+ local_irq_restore(flags);
+ }
+}
+
+#endif /* CONFIG_X86_64 */
+
+static inline void __init lkgs_init(void)
+{
+#ifdef CONFIG_PARAVIRT_XXL
+#ifdef CONFIG_X86_64
+ if (cpu_feature_enabled(X86_FEATURE_LKGS))
+ pv_ops.cpu.load_gs_index = native_lkgs;
+#endif
+#endif
+}
+
+#ifndef CONFIG_PARAVIRT_XXL
+
+static inline void load_gs_index(unsigned int selector)
+{
+#ifdef CONFIG_X86_64
+ native_load_gs_index(selector);
+#else
+ loadsegment(gs, selector);
+#endif
+}
+
+#endif /* CONFIG_PARAVIRT_XXL */
+
+#endif /* _ASM_X86_GSSEG_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 275e7fd20310..66837b8c67f1 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -3,9 +3,9 @@
#define _ASM_X86_HARDIRQ_H
#include <linux/threads.h>
+#include <asm/current.h>
typedef struct {
- u16 __softirq_pending;
#if IS_ENABLED(CONFIG_KVM_INTEL)
u8 kvm_cpu_l1tf_flush_l1d;
#endif
@@ -60,6 +60,7 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
extern u64 arch_irq_stat(void);
#define arch_irq_stat arch_irq_stat
+#define local_softirq_pending_ref pcpu_hot.softirq_pending
#if IS_ENABLED(CONFIG_KVM_INTEL)
static inline void kvm_set_cpu_l1tf_flush_l1d(void)
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 3089ec352743..cea95dcd27c2 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -61,6 +61,8 @@
#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10)
/* Support for debug MSRs available */
#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11)
+/* Support for extended gva ranges for flush hypercalls available */
+#define HV_FEATURE_EXT_GVA_RANGES_FLUSH BIT(14)
/*
* Support for returning hypercall output block via XMM
* registers is available
@@ -114,9 +116,15 @@
/* Recommend using the newer ExProcessorMasks interface */
#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED BIT(11)
+/* Indicates that the hypervisor is nested within a Hyper-V partition. */
+#define HV_X64_HYPERV_NESTED BIT(12)
+
/* Recommend using enlightened VMCS */
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED BIT(14)
+/* Use hypercalls for MMIO config space access */
+#define HV_X64_USE_MMIO_HYPERCALLS BIT(21)
+
/*
* CPU management features identification.
* These are HYPERV_CPUID_CPU_MANAGEMENT_FEATURES.EAX bits.
@@ -223,6 +231,17 @@ enum hv_isolation_type {
#define HV_REGISTER_SINT15 0x4000009F
/*
+ * Define synthetic interrupt controller model specific registers for
+ * nested hypervisor.
+ */
+#define HV_REGISTER_NESTED_SCONTROL 0x40001080
+#define HV_REGISTER_NESTED_SVERSION 0x40001081
+#define HV_REGISTER_NESTED_SIEFP 0x40001082
+#define HV_REGISTER_NESTED_SIMP 0x40001083
+#define HV_REGISTER_NESTED_EOM 0x40001084
+#define HV_REGISTER_NESTED_SINT0 0x40001090
+
+/*
* Synthetic Timer MSRs. Four timers per vcpu.
*/
#define HV_REGISTER_STIMER0_CONFIG 0x400000B0
@@ -253,6 +272,9 @@ enum hv_isolation_type {
/* TSC invariant control */
#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118
+/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
+#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0)
+
/* Register name aliases for temporary compatibility */
#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
@@ -366,7 +388,8 @@ struct hv_nested_enlightenments_control {
__u32 reserved:31;
} features;
struct {
- __u32 reserved;
+ __u32 inter_partition_comm:1;
+ __u32 reserved:31;
} hypercallControls;
} __packed;
@@ -374,11 +397,20 @@ struct hv_nested_enlightenments_control {
struct hv_vp_assist_page {
__u32 apic_assist;
__u32 reserved1;
- __u64 vtl_control[3];
+ __u32 vtl_entry_reason;
+ __u32 vtl_reserved;
+ __u64 vtl_ret_x64rax;
+ __u64 vtl_ret_x64rcx;
struct hv_nested_enlightenments_control nested_control;
__u8 enlighten_vmentry;
__u8 reserved2[7];
__u64 current_nested_vmcs;
+ __u8 synthetic_time_unhalted_timer_expired;
+ __u8 reserved3[7];
+ __u8 virtualization_fault_information[40];
+ __u8 reserved4[8];
+ __u8 intercept_message[256];
+ __u8 vtl_ret_actions[256];
} __packed;
struct hv_enlightened_vmcs {
@@ -598,6 +630,41 @@ struct hv_enlightened_vmcs {
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF
+/*
+ * Note, Hyper-V isn't actually stealing bit 28 from Intel, just abusing it by
+ * pairing it with architecturally impossible exit reasons. Bit 28 is set only
+ * on SMI exits to a SMI transfer monitor (STM) and if and only if a MTF VM-Exit
+ * is pending. I.e. it will never be set by hardware for non-SMI exits (there
+ * are only three), nor will it ever be set unless the VMM is an STM.
+ */
+#define HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH 0x10000031
+
+/*
+ * Hyper-V uses the software reserved 32 bytes in VMCB control area to expose
+ * SVM enlightenments to guests.
+ */
+struct hv_vmcb_enlightenments {
+ struct __packed hv_enlightenments_control {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 enlightened_npt_tlb: 1;
+ u32 reserved:29;
+ } __packed hv_enlightenments_control;
+ u32 hv_vp_id;
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 reserved;
+} __packed;
+
+/*
+ * Hyper-V uses the software reserved clean bit in VMCB.
+ */
+#define HV_VMCB_NESTED_ENLIGHTENMENTS 31
+
+/* Synthetic VM-Exit */
+#define HV_SVM_EXITCODE_ENL 0xf0000000
+#define HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH (1)
+
struct hv_partition_assist_pg {
u32 tlb_lock_count;
};
@@ -649,6 +716,81 @@ union hv_msi_entry {
} __packed;
};
+struct hv_x64_segment_register {
+ u64 base;
+ u32 limit;
+ u16 selector;
+ union {
+ struct {
+ u16 segment_type : 4;
+ u16 non_system_segment : 1;
+ u16 descriptor_privilege_level : 2;
+ u16 present : 1;
+ u16 reserved : 4;
+ u16 available : 1;
+ u16 _long : 1;
+ u16 _default : 1;
+ u16 granularity : 1;
+ } __packed;
+ u16 attributes;
+ };
+} __packed;
+
+struct hv_x64_table_register {
+ u16 pad[3];
+ u16 limit;
+ u64 base;
+} __packed;
+
+struct hv_init_vp_context {
+ u64 rip;
+ u64 rsp;
+ u64 rflags;
+
+ struct hv_x64_segment_register cs;
+ struct hv_x64_segment_register ds;
+ struct hv_x64_segment_register es;
+ struct hv_x64_segment_register fs;
+ struct hv_x64_segment_register gs;
+ struct hv_x64_segment_register ss;
+ struct hv_x64_segment_register tr;
+ struct hv_x64_segment_register ldtr;
+
+ struct hv_x64_table_register idtr;
+ struct hv_x64_table_register gdtr;
+
+ u64 efer;
+ u64 cr0;
+ u64 cr3;
+ u64 cr4;
+ u64 msr_cr_pat;
+} __packed;
+
+union hv_input_vtl {
+ u8 as_uint8;
+ struct {
+ u8 target_vtl: 4;
+ u8 use_target_vtl: 1;
+ u8 reserved_z: 3;
+ };
+} __packed;
+
+struct hv_enable_vp_vtl {
+ u64 partition_id;
+ u32 vp_index;
+ union hv_input_vtl target_vtl;
+ u8 mbz0;
+ u16 mbz1;
+ struct hv_init_vp_context vp_context;
+} __packed;
+
+struct hv_get_vp_from_apic_id_in {
+ u64 partition_id;
+ union hv_input_vtl target_vtl;
+ u8 res[7];
+ u32 apic_ids[];
+} __packed;
+
#include <asm-generic/hyperv-tlfs.h>
#endif
diff --git a/arch/x86/include/asm/hyperv_timer.h b/arch/x86/include/asm/hyperv_timer.h
new file mode 100644
index 000000000000..388fa81b8f38
--- /dev/null
+++ b/arch/x86/include/asm/hyperv_timer.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_HYPERV_TIMER_H
+#define _ASM_X86_HYPERV_TIMER_H
+
+#include <asm/msr.h>
+
+#define hv_get_raw_timer() rdtsc_ordered()
+
+#endif
diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h
index 9b08082a5d9f..baae6b4fea23 100644
--- a/arch/x86/include/asm/ibt.h
+++ b/arch/x86/include/asm/ibt.h
@@ -74,7 +74,7 @@ static inline bool is_endbr(u32 val)
return val == gen_endbr();
}
-extern __noendbr u64 ibt_save(void);
+extern __noendbr u64 ibt_save(bool disable);
extern __noendbr void ibt_restore(u64 save);
#else /* __ASSEMBLY__ */
@@ -100,7 +100,7 @@ extern __noendbr void ibt_restore(u64 save);
static inline bool is_endbr(u32 val) { return false; }
-static inline u64 ibt_save(void) { return 0; }
+static inline u64 ibt_save(bool disable) { return 0; }
static inline void ibt_restore(u64 save) { }
#else /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..b241af4ce9b4 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
/* NMI */
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
- * Special NOIST entry point for VMX which invokes this on the kernel
- * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
- * 'executing' marker.
- *
- * On 32bit this just uses the regular NMI entry point because 32-bit does
- * not have ISTs.
+ * Special entry point for VMX which invokes this on the kernel stack, even for
+ * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
+ * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
+ * to avoid more ifdeffery.
*/
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
-#else
-#define asm_exc_nmi_noist asm_exc_nmi
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
#endif
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index f07faa61c7f3..54368a43abf6 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -32,16 +32,16 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs,
bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE], int buf_size);
-enum mmio_type {
- MMIO_DECODE_FAILED,
- MMIO_WRITE,
- MMIO_WRITE_IMM,
- MMIO_READ,
- MMIO_READ_ZERO_EXTEND,
- MMIO_READ_SIGN_EXTEND,
- MMIO_MOVS,
+enum insn_mmio_type {
+ INSN_MMIO_DECODE_FAILED,
+ INSN_MMIO_WRITE,
+ INSN_MMIO_WRITE_IMM,
+ INSN_MMIO_READ,
+ INSN_MMIO_READ_ZERO_EXTEND,
+ INSN_MMIO_READ_SIGN_EXTEND,
+ INSN_MMIO_MOVS,
};
-enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
+enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 347707d459c6..b3af2d45bbbb 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -123,6 +123,10 @@
#define INTEL_FAM6_METEORLAKE 0xAC
#define INTEL_FAM6_METEORLAKE_L 0xAA
+#define INTEL_FAM6_LUNARLAKE_M 0xBD
+
+#define INTEL_FAM6_ARROWLAKE 0xC6
+
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index c201083b34f6..a3abdcd89a32 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -20,25 +20,4 @@ extern void intel_mid_pwr_power_off(void);
extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
-#ifdef CONFIG_X86_INTEL_MID
-
-extern void intel_scu_devices_create(void);
-extern void intel_scu_devices_destroy(void);
-
-#else /* !CONFIG_X86_INTEL_MID */
-
-static inline void intel_scu_devices_create(void) { }
-static inline void intel_scu_devices_destroy(void) { }
-
-#endif /* !CONFIG_X86_INTEL_MID */
-
-/* Bus Select SoC Fuse value */
-#define BSEL_SOC_FUSE_MASK 0x7
-/* FSB 133MHz */
-#define BSEL_SOC_FUSE_001 0x1
-/* FSB 100MHz */
-#define BSEL_SOC_FUSE_101 0x5
-/* FSB 83MHz */
-#define BSEL_SOC_FUSE_111 0x7
-
#endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 7cc49432187f..7a2ed154a5e1 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -44,10 +44,6 @@ extern int irq_remapping_reenable(int);
extern int irq_remap_enable_fault_handling(void);
extern void panic_if_irq_remap(const char *msg);
-/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
-extern struct irq_domain *
-arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id);
-
/* Get parent irqdomain for interrupt remapping irqdomain */
static inline struct irq_domain *arch_get_ir_parent_domain(void)
{
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 147cb8fdda92..798183867d78 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -116,7 +116,7 @@
ASM_CALL_ARG2
#define call_on_irqstack(func, asm_call, argconstr...) \
- call_on_stack(__this_cpu_read(hardirq_stack_ptr), \
+ call_on_stack(__this_cpu_read(pcpu_hot.hardirq_stack_ptr), \
func, asm_call, argconstr)
/* Macros to assert type correctness for run_*_on_irqstack macros */
@@ -135,7 +135,7 @@
* User mode entry and interrupt on the irq stack do not \
* switch stacks. If from user mode the task stack is empty. \
*/ \
- if (user_mode(regs) || __this_cpu_read(hardirq_stack_inuse)) { \
+ if (user_mode(regs) || __this_cpu_read(pcpu_hot.hardirq_stack_inuse)) { \
irq_enter_rcu(); \
func(c_args); \
irq_exit_rcu(); \
@@ -146,9 +146,9 @@
* places. Invoke the stack switch macro with the call \
* sequence which matches the above direct invocation. \
*/ \
- __this_cpu_write(hardirq_stack_inuse, true); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
call_on_irqstack(func, asm_call, constr); \
- __this_cpu_write(hardirq_stack_inuse, false); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
} \
}
@@ -212,9 +212,9 @@
*/
#define do_softirq_own_stack() \
{ \
- __this_cpu_write(hardirq_stack_inuse, true); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, true); \
call_on_irqstack(__do_softirq, ASM_CALL_ARG0); \
- __this_cpu_write(hardirq_stack_inuse, false); \
+ __this_cpu_write(pcpu_hot.hardirq_stack_inuse, false); \
}
#endif
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index 125c23b7bad3..30c325c235c0 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -7,9 +7,7 @@
#ifdef CONFIG_X86_LOCAL_APIC
enum {
- /* Allocate contiguous CPU vectors */
- X86_IRQ_ALLOC_CONTIGUOUS_VECTORS = 0x1,
- X86_IRQ_ALLOC_LEGACY = 0x2,
+ X86_IRQ_ALLOC_LEGACY = 0x1,
};
extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 7793e52d6237..8c5ae649d2df 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -8,9 +8,6 @@
#include <asm/nospec-branch.h>
-/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
-#define __cpuidle __section(".cpuidle.text")
-
/*
* Interrupt control:
*/
@@ -45,13 +42,13 @@ static __always_inline void native_irq_enable(void)
asm volatile("sti": : :"memory");
}
-static inline __cpuidle void native_safe_halt(void)
+static __always_inline void native_safe_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("sti; hlt": : :"memory");
}
-static inline __cpuidle void native_halt(void)
+static __always_inline void native_halt(void)
{
mds_idle_clear_cpu_buffers();
asm volatile("hlt": : :"memory");
@@ -84,7 +81,7 @@ static __always_inline void arch_local_irq_enable(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline __cpuidle void arch_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -93,7 +90,7 @@ static inline __cpuidle void arch_safe_halt(void)
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline __cpuidle void halt(void)
+static __always_inline void halt(void)
{
native_halt();
}
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 13e70da38bed..de75306b932e 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -28,9 +28,12 @@
#ifdef CONFIG_KASAN
void __init kasan_early_init(void);
void __init kasan_init(void);
+void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid);
#else
static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
+static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size,
+ int nid) { }
#endif
#endif
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index a3760ca796aa..5b77bbc28f96 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -200,9 +200,6 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
const Elf_Shdr *symtab);
#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
-void *arch_kexec_kernel_image_load(struct kimage *image);
-#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load
-
int arch_kimage_file_post_load_cleanup(struct kimage *image);
#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
#endif
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 82ba4a564e58..13bc212cd4bc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,6 +14,7 @@ BUILD_BUG_ON(1)
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
+KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
@@ -53,8 +54,8 @@ KVM_X86_OP(set_rflags)
KVM_X86_OP(get_if_flag)
KVM_X86_OP(flush_tlb_all)
KVM_X86_OP(flush_tlb_current)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush)
-KVM_X86_OP_OPTIONAL(tlb_remote_flush_with_range)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs)
+KVM_X86_OP_OPTIONAL(flush_remote_tlbs_range)
KVM_X86_OP(flush_tlb_gva)
KVM_X86_OP(flush_tlb_guest)
KVM_X86_OP(vcpu_pre_run)
@@ -67,6 +68,8 @@ KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
KVM_X86_OP(inject_irq)
KVM_X86_OP(inject_nmi)
+KVM_X86_OP_OPTIONAL_RET0(is_vnmi_pending)
+KVM_X86_OP_OPTIONAL_RET0(set_vnmi_pending)
KVM_X86_OP(inject_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
@@ -76,7 +79,6 @@ KVM_X86_OP(set_nmi_mask)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP_OPTIONAL(update_cr8_intercept)
-KVM_X86_OP(check_apicv_inhibit_reasons)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
@@ -110,10 +112,12 @@ KVM_X86_OP_OPTIONAL_RET0(dy_apicv_has_pending_interrupt)
KVM_X86_OP_OPTIONAL(set_hv_timer)
KVM_X86_OP_OPTIONAL(cancel_hv_timer)
KVM_X86_OP(setup_mce)
+#ifdef CONFIG_KVM_SMM
KVM_X86_OP(smi_allowed)
KVM_X86_OP(enter_smm)
KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
+#endif
KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
@@ -123,7 +127,7 @@ KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
KVM_X86_OP(get_msr_feature)
KVM_X86_OP(can_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
-KVM_X86_OP_OPTIONAL(enable_direct_tlbflush)
+KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f05ebaa26f0f..fb9d1f2d6136 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
#include <linux/hyperv.h>
+#include <linux/kfifo.h>
#include <asm/apic.h>
#include <asm/pvclock-abi.h>
@@ -81,7 +82,9 @@
#define KVM_REQ_NMI KVM_ARCH_REQ(9)
#define KVM_REQ_PMU KVM_ARCH_REQ(10)
#define KVM_REQ_PMI KVM_ARCH_REQ(11)
+#ifdef CONFIG_KVM_SMM
#define KVM_REQ_SMI KVM_ARCH_REQ(12)
+#endif
#define KVM_REQ_MASTERCLOCK_UPDATE KVM_ARCH_REQ(13)
#define KVM_REQ_MCLOCK_INPROGRESS \
KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
@@ -108,6 +111,8 @@
KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
+#define KVM_REQ_HV_TLB_FLUSH \
+ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -129,8 +134,6 @@
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-#define INVALID_GPA (~(gpa_t)0)
-
/* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
@@ -204,6 +207,7 @@ typedef enum exit_fastpath_completion fastpath_t;
struct x86_emulate_ctxt;
struct x86_exception;
+union kvm_smram;
enum x86_intercept;
enum x86_intercept_stage;
@@ -253,16 +257,16 @@ enum x86_intercept_stage;
#define PFERR_GUEST_PAGE_BIT 33
#define PFERR_IMPLICIT_ACCESS_BIT 48
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
-#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
-#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
-#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
-#define PFERR_IMPLICIT_ACCESS (1ULL << PFERR_IMPLICIT_ACCESS_BIT)
+#define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT)
+#define PFERR_USER_MASK BIT(PFERR_USER_BIT)
+#define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT)
+#define PFERR_PK_MASK BIT(PFERR_PK_BIT)
+#define PFERR_SGX_MASK BIT(PFERR_SGX_BIT)
+#define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT)
+#define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT)
#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
PFERR_WRITE_MASK | \
@@ -416,6 +420,10 @@ struct kvm_mmu_root_info {
#define KVM_MMU_NUM_PREV_ROOTS 3
+#define KVM_MMU_ROOT_CURRENT BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
+#define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
+
#define KVM_HAVE_MMU_RWLOCK
struct kvm_mmu_page;
@@ -435,9 +443,8 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t gva_or_gpa, u64 access,
struct x86_exception *exception);
- int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp);
- void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
+ int (*sync_spte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, int i);
struct kvm_mmu_root_info root;
union kvm_cpu_role cpu_role;
union kvm_mmu_page_role root_role;
@@ -475,11 +482,6 @@ struct kvm_mmu {
u64 pdptrs[4]; /* pae */
};
-struct kvm_tlb_range {
- u64 start_gfn;
- u64 pages;
-};
-
enum pmc_type {
KVM_PMC_GP = 0,
KVM_PMC_FIXED,
@@ -488,17 +490,19 @@ enum pmc_type {
struct kvm_pmc {
enum pmc_type type;
u8 idx;
+ bool is_paused;
+ bool intr;
u64 counter;
+ u64 prev_counter;
u64 eventsel;
struct perf_event *perf_event;
struct kvm_vcpu *vcpu;
/*
+ * only for creating or reusing perf_event,
* eventsel value for general purpose counters,
* ctrl value for fixed counters.
*/
u64 current_config;
- bool is_paused;
- bool intr;
};
/* More counters may conflict with other existing Architectural MSRs */
@@ -506,8 +510,10 @@ struct kvm_pmc {
#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define KVM_PMC_MAX_FIXED 3
+#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
+ u8 version;
unsigned nr_arch_gp_counters;
unsigned nr_arch_fixed_counters;
unsigned available_event_types;
@@ -520,11 +526,19 @@ struct kvm_pmu {
u64 global_ovf_ctrl_mask;
u64 reserved_bits;
u64 raw_event_mask;
- u8 version;
struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC];
struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED];
struct irq_work irq_work;
- DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+
+ /*
+ * Overlay the bitmap with a 64-bit atomic so that all bits can be
+ * set in a single access, e.g. to reprogram all counters when the PMU
+ * filter changes.
+ */
+ union {
+ DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
+ atomic64_t __reprogram_pmi;
+ };
DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
@@ -602,6 +616,29 @@ struct kvm_vcpu_hv_synic {
bool dont_zero_synic_pages;
};
+/* The maximum number of entries on the TLB flush fifo. */
+#define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)
+/*
+ * Note: the following 'magic' entry is made up by KVM to avoid putting
+ * anything besides GVA on the TLB flush fifo. It is theoretically possible
+ * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000
+ * which will look identical. KVM's action to 'flush everything' instead of
+ * flushing these particular addresses is, however, fully legitimate as
+ * flushing more than requested is always OK.
+ */
+#define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1)
+
+enum hv_tlb_flush_fifos {
+ HV_L1_TLB_FLUSH_FIFO,
+ HV_L2_TLB_FLUSH_FIFO,
+ HV_NR_TLB_FLUSH_FIFOS,
+};
+
+struct kvm_vcpu_hv_tlb_flush_fifo {
+ spinlock_t write_lock;
+ DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+};
+
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
struct kvm_vcpu *vcpu;
@@ -623,6 +660,24 @@ struct kvm_vcpu_hv {
u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
} cpuid_cache;
+
+ struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
+
+ /* Preallocated buffer for handling hypercalls passing sparse vCPU set */
+ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
+
+ struct hv_vp_assist_page vp_assist_page;
+
+ struct {
+ u64 pa_page_gpa;
+ u64 vm_id;
+ u32 vp_id;
+ } nested;
+};
+
+struct kvm_hypervisor_cpuid {
+ u32 base;
+ u32 limit;
};
/* Xen HVM per vcpu emulation context */
@@ -633,6 +688,7 @@ struct kvm_vcpu_xen {
struct gfn_to_pfn_cache vcpu_info_cache;
struct gfn_to_pfn_cache vcpu_time_info_cache;
struct gfn_to_pfn_cache runstate_cache;
+ struct gfn_to_pfn_cache runstate2_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
@@ -644,6 +700,7 @@ struct kvm_vcpu_xen {
struct hrtimer timer;
int poll_evtchn;
struct timer_list poll_timer;
+ struct kvm_hypervisor_cpuid cpuid;
};
struct kvm_queued_exception {
@@ -772,7 +829,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- u32 kvm_cpuid_base;
+ struct kvm_hypervisor_cpuid kvm_cpuid;
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -817,7 +874,8 @@ struct kvm_vcpu_arch {
u64 tsc_scaling_ratio; /* current scaling ratio */
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
- unsigned nmi_pending; /* NMI queued after currently running handler */
+ /* Number of NMIs pending injection, not including hardware vNMIs. */
+ unsigned int nmi_pending;
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
u8 handling_intr_from_guest;
@@ -888,23 +946,6 @@ struct kvm_vcpu_arch {
u64 msr_kvm_poll_control;
- /*
- * Indicates the guest is trying to write a gfn that contains one or
- * more of the PTEs used to translate the write itself, i.e. the access
- * is changing its own translation in the guest page tables. KVM exits
- * to userspace if emulation of the faulting instruction fails and this
- * flag is set, as KVM cannot make forward progress.
- *
- * If emulation fails for a write to guest page tables, KVM unprotects
- * (zaps) the shadow page for the target gfn and resumes the guest to
- * retry the non-emulatable instruction (on hardware). Unprotecting the
- * gfn doesn't allow forward progress for a self-changing access because
- * doing so also zaps the translation for the gfn, i.e. retrying the
- * instruction will hit a !PRESENT fault, which results in a new shadow
- * page and sends KVM back to square one.
- */
- bool write_fault_to_shadow_pgtable;
-
/* set at EPT violation at this point */
unsigned long exit_qualification;
@@ -968,19 +1009,30 @@ struct kvm_arch_memory_slot {
};
/*
- * We use as the mode the number of bits allocated in the LDR for the
- * logical processor ID. It happens that these are all powers of two.
- * This makes it is very easy to detect cases where the APICs are
- * configured for multiple modes; in that case, we cannot use the map and
- * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APIs to be in the same mode, each addressable APIC to
+ * be mapped to only one MDA, and each MDA to map to at most one APIC.
*/
-#define KVM_APIC_MODE_XAPIC_CLUSTER 4
-#define KVM_APIC_MODE_XAPIC_FLAT 8
-#define KVM_APIC_MODE_X2APIC 16
+enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode. */
+ KVM_APIC_MODE_XAPIC_CLUSTER,
+ /* All software enabled local APICs in xAPIC flat addressing mode. */
+ KVM_APIC_MODE_XAPIC_FLAT,
+ /* All software enabled local APICs in x2APIC mode. */
+ KVM_APIC_MODE_X2APIC,
+ /*
+ * Optimized map disabled, e.g. not all local APICs in the same logical
+ * mode, same logical ID assigned to multiple APICs, etc.
+ */
+ KVM_APIC_MODE_MAP_DISABLED,
+};
struct kvm_apic_map {
struct rcu_head rcu;
- u8 mode;
+ enum kvm_apic_logical_mode logical_mode;
u32 max_apic_id;
union {
struct kvm_lapic *xapic_flat_map[8];
@@ -1034,6 +1086,7 @@ struct kvm_hv {
u64 hv_reenlightenment_control;
u64 hv_tsc_emulation_control;
u64 hv_tsc_emulation_status;
+ u64 hv_invtsc_control;
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
@@ -1057,8 +1110,10 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ struct mutex xen_lock;
u32 xen_version;
bool long_mode;
+ bool runstate_update_flag;
u8 upcall_vector;
struct gfn_to_pfn_cache shinfo_cache;
struct idr evtchn_ports;
@@ -1077,6 +1132,18 @@ struct kvm_x86_msr_filter {
struct msr_bitmap_range ranges[16];
};
+struct kvm_x86_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 nr_includes;
+ __u32 nr_excludes;
+ __u64 *includes;
+ __u64 *excludes;
+ __u64 events[];
+};
+
enum kvm_apicv_inhibit {
/********************************************************************/
@@ -1108,6 +1175,12 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_BLOCKIRQ,
/*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
+ /*
* For simplicity, the APIC acceleration is inhibited
* first time either APIC ID or APIC base are changed by the guest
* from their reset values.
@@ -1145,6 +1218,12 @@ enum kvm_apicv_inhibit {
* AVIC is disabled because SEV doesn't support it.
*/
APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
};
struct kvm_arch {
@@ -1156,7 +1235,18 @@ struct kvm_arch {
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
- struct list_head lpage_disallowed_mmu_pages;
+ /*
+ * A list of kvm_mmu_page structs that, if zapped, could possibly be
+ * replaced by an NX huge page. A shadow page is on this list if its
+ * existence disallows an NX huge page (nx_huge_page_disallowed is set)
+ * and there are no other conditions that prevent a huge page, e.g.
+ * the backing host page is huge, dirtly logging is not enabled for its
+ * memslot, etc... Note, zapping shadow pages on this list doesn't
+ * guarantee an NX huge page will be created in its stead, e.g. if the
+ * guest attempts to execute from the region then KVM obviously can't
+ * create an NX huge page (without hanging the guest).
+ */
+ struct list_head possible_nx_huge_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
/*
@@ -1182,10 +1272,11 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
- struct rw_semaphore apicv_update_lock;
-
bool apic_access_memslot_enabled;
+ bool apic_access_memslot_inhibited;
+
+ /* Protects apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1235,7 +1326,6 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
- int cpu_dirty_logging_count;
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
@@ -1271,22 +1361,16 @@ struct kvm_arch {
/* Guest can access the SGX PROVISIONKEY. */
bool sgx_provisioning_allowed;
- struct kvm_pmu_event_filter __rcu *pmu_event_filter;
- struct task_struct *nx_lpage_recovery_thread;
+ struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
+ struct task_struct *nx_huge_page_recovery_thread;
#ifdef CONFIG_X86_64
- /*
- * Whether the TDP MMU is enabled for this VM. This contains a
- * snapshot of the TDP MMU module parameter from when the VM was
- * created and remains unchanged for the life of the VM. If this is
- * true, TDP MMU handler functions will run for various MMU
- * operations.
- */
- bool tdp_mmu_enabled;
+ /* The number of TDP MMU pages across all roots. */
+ atomic64_t tdp_mmu_pages;
/*
- * List of kvm_mmu_page structs being used as roots.
- * All kvm_mmu_page structs in the list should have
+ * List of struct kvm_mmu_pages being used as roots.
+ * All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
*
* For reads, this list is protected by:
@@ -1305,20 +1389,12 @@ struct kvm_arch {
struct list_head tdp_mmu_roots;
/*
- * List of kvm_mmu_page structs not being used as roots.
- * All kvm_mmu_page structs in the list should have
- * tdp_mmu_page set and a tdp_mmu_root_count of 0.
- */
- struct list_head tdp_mmu_pages;
-
- /*
* Protects accesses to the following fields when the MMU lock
* is held in read mode:
* - tdp_mmu_roots (above)
- * - tdp_mmu_pages (above)
* - the link field of kvm_mmu_page structs used by the TDP MMU
- * - lpage_disallowed_mmu_pages
- * - the lpage_disallowed_link field of kvm_mmu_page structs used
+ * - possible_nx_huge_pages;
+ * - the possible_nx_huge_page_link field of kvm_mmu_page structs used
* by the TDP MMU
* It is acceptable, but not necessary, to acquire this lock when
* the thread holds the MMU lock in write mode.
@@ -1458,6 +1534,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
struct kvm_x86_ops {
const char *name;
+ int (*check_processor_compatibility)(void);
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
@@ -1506,9 +1584,9 @@ struct kvm_x86_ops {
void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
- int (*tlb_remote_flush)(struct kvm *kvm);
- int (*tlb_remote_flush_with_range)(struct kvm *kvm,
- struct kvm_tlb_range *range);
+ int (*flush_remote_tlbs)(struct kvm *kvm);
+ int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
+ gfn_t nr_pages);
/*
* Flush any TLB entries associated with the given GVA.
@@ -1542,10 +1620,19 @@ struct kvm_x86_ops {
int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
+ /* Whether or not a virtual NMI is pending in hardware. */
+ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
+ /*
+ * Attempt to pend a virtual NMI in harware. Returns %true on success
+ * to allow using static_call_ret0 as the fallback.
+ */
+ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ const unsigned long required_apicv_inhibits;
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
@@ -1612,10 +1699,12 @@ struct kvm_x86_ops {
void (*setup_mce)(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_KVM_SMM
int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
- int (*enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
- int (*leave_smm)(struct kvm_vcpu *vcpu, const char *smstate);
+ int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+ int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
void (*enable_smi_window)(struct kvm_vcpu *vcpu);
+#endif
int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
@@ -1630,7 +1719,7 @@ struct kvm_x86_ops {
void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
- int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
+ int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
void (*migrate_timers)(struct kvm_vcpu *vcpu);
void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
@@ -1663,12 +1752,10 @@ struct kvm_x86_nested_ops {
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+ void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_init_ops {
- int (*cpu_has_kvm_support)(void);
- int (*disabled_by_bios)(void);
- int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
unsigned int (*handle_intel_pt_intr)(void);
@@ -1695,6 +1782,9 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1707,8 +1797,8 @@ void kvm_arch_free_vm(struct kvm *kvm);
#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
{
- if (kvm_x86_ops.tlb_remote_flush &&
- !static_call(kvm_x86_tlb_remote_flush)(kvm))
+ if (kvm_x86_ops.flush_remote_tlbs &&
+ !static_call(kvm_x86_flush_remote_tlbs)(kvm))
return 0;
else
return -ENOTSUPP;
@@ -1806,6 +1896,25 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
* EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
* state and inject single-step #DBs after skipping
* an instruction (after completing userspace I/O).
+ *
+ * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
+ * is attempting to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself,
+ * and the owning page table is being shadowed by KVM.
+ * If emulation of the faulting instruction fails and
+ * this flag is set, KVM will exit to userspace instead
+ * of retrying emulation as KVM cannot make forward
+ * progress.
+ *
+ * If emulation fails for a write to guest page tables,
+ * KVM unprotects (zaps) the shadow page for the target
+ * gfn and resumes the guest to retry the non-emulatable
+ * instruction (on hardware). Unprotecting the gfn
+ * doesn't allow forward progress for a self-changing
+ * access because doing so also zaps the translation for
+ * the gfn, i.e. retrying the instruction will hit a
+ * !PRESENT fault, which results in a new shadow page
+ * and sends KVM back to square one.
*/
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
@@ -1815,6 +1924,7 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
#define EMULTYPE_VMWARE_GP (1 << 5)
#define EMULTYPE_PF (1 << 6)
#define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
+#define EMULTYPE_WRITE_PF_TO_SP (1 << 8)
int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
@@ -1844,6 +1954,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
@@ -1892,14 +2003,11 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
return !!(*irq_state);
}
-#define KVM_MMU_ROOT_CURRENT BIT(0)
-#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i)
-#define KVM_MMU_ROOTS_ALL (~0UL)
-
int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
void kvm_inject_nmi(struct kvm_vcpu *vcpu);
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
@@ -1909,8 +2017,6 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1918,7 +2024,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
bool kvm_apicv_activated(struct kvm *kvm);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
@@ -1941,8 +2047,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa);
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
@@ -1990,18 +2096,19 @@ enum {
TASK_SWITCH_GATE = 3,
};
-#define HF_GIF_MASK (1 << 0)
-#define HF_NMI_MASK (1 << 3)
-#define HF_IRET_MASK (1 << 4)
-#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
-#define HF_SMM_MASK (1 << 6)
-#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
+#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */
-#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
-#define KVM_ADDRESS_SPACE_NUM 2
+#ifdef CONFIG_KVM_SMM
+#define HF_SMM_MASK (1 << 1)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
-#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
-#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+# define KVM_ADDRESS_SPACE_NUM 2
+# define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
+#else
+# define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
+#endif
#define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -2089,14 +2196,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
-#define GET_SMSTATE(type, buf, offset) \
- (*(type *)((buf) + (offset) - 0x7e00))
-
-int kvm_cpu_dirty_log_size(void);
-
int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
#define KVM_CLOCK_VALID_FLAGS \
@@ -2111,4 +2210,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+/*
+ * KVM previously used a u32 field in kvm_run to indicate the hypercall was
+ * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
+ * remaining 31 lower bits must be 0 to preserve ABI.
+ */
+#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
index 6c5765192102..511b35069187 100644
--- a/arch/x86/include/asm/kvmclock.h
+++ b/arch/x86/include/asm/kvmclock.h
@@ -8,7 +8,7 @@ extern struct clocksource kvm_clock;
DECLARE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
-static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
+static __always_inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
return &this_cpu_read(hv_clock_per_cpu)->pvti;
}
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index f484d656d34e..0953aa32a324 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -12,13 +12,26 @@
#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
#endif /* CONFIG_X86_32 */
-#ifdef __ASSEMBLY__
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_ALIGNMENT_16)
-#define __ALIGN .p2align 4, 0x90
+#define __ALIGN .balign CONFIG_FUNCTION_ALIGNMENT, 0x90;
#define __ALIGN_STR __stringify(__ALIGN)
+
+#if defined(CONFIG_CALL_PADDING) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
+#define FUNCTION_PADDING .skip CONFIG_FUNCTION_ALIGNMENT, 0x90;
+#else
+#define FUNCTION_PADDING
+#endif
+
+#if (CONFIG_FUNCTION_ALIGNMENT > 8) && !defined(__DISABLE_EXPORTS) && !defined(BULID_VDSO)
+# define __FUNC_ALIGN __ALIGN; FUNCTION_PADDING
+#else
+# define __FUNC_ALIGN __ALIGN
#endif
+#define ASM_FUNC_ALIGN __stringify(__FUNC_ALIGN)
+#define SYM_F_ALIGN __FUNC_ALIGN
+
+#ifdef __ASSEMBLY__
+
#if defined(CONFIG_RETHUNK) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO)
#define RET jmp __x86_return_thunk
#else /* CONFIG_RETPOLINE */
@@ -43,21 +56,55 @@
#endif /* __ASSEMBLY__ */
+/*
+ * Depending on -fpatchable-function-entry=N,N usage (CONFIG_CALL_PADDING) the
+ * CFI symbol layout changes.
+ *
+ * Without CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .skip FUNCTION_PADDING, 0x90
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * name:
+ *
+ * With CALL_THUNKS:
+ *
+ * .align FUNCTION_ALIGNMENT
+ * __cfi_##name:
+ * .byte 0xb8
+ * .long __kcfi_typeid_##name
+ * .skip FUNCTION_PADDING, 0x90
+ * name:
+ *
+ * In both cases the whole thing is FUNCTION_ALIGNMENT aligned and sized.
+ */
+
+#ifdef CONFIG_CALL_PADDING
+#define CFI_PRE_PADDING
+#define CFI_POST_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#else
+#define CFI_PRE_PADDING .skip CONFIG_FUNCTION_PADDING_BYTES, 0x90;
+#define CFI_POST_PADDING
+#endif
+
#define __CFI_TYPE(name) \
SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \
- .fill 11, 1, 0x90 ASM_NL \
+ CFI_PRE_PADDING \
.byte 0xb8 ASM_NL \
.long __kcfi_typeid_##name ASM_NL \
+ CFI_POST_PADDING \
SYM_FUNC_END(__cfi_##name)
/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
#define SYM_TYPED_FUNC_START(name) \
- SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START -- use for global functions */
#define SYM_FUNC_START(name) \
- SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_GLOBAL, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_NOALIGN -- use for global functions, w/o alignment */
@@ -67,7 +114,7 @@
/* SYM_FUNC_START_LOCAL -- use for local functions */
#define SYM_FUNC_START_LOCAL(name) \
- SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_LOCAL, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_LOCAL_NOALIGN -- use for local functions, w/o alignment */
@@ -77,7 +124,7 @@
/* SYM_FUNC_START_WEAK -- use for weak functions */
#define SYM_FUNC_START_WEAK(name) \
- SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN) \
+ SYM_START(name, SYM_L_WEAK, SYM_F_ALIGN) \
ENDBR
/* SYM_FUNC_START_WEAK_NOALIGN -- use for weak functions, w/o alignment */
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 349a47acaa4a..56d4ef604b91 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -120,8 +120,17 @@ static inline long local_sub_return(long i, local_t *l)
#define local_inc_return(l) (local_add_return(1, l))
#define local_dec_return(l) (local_sub_return(1, l))
-#define local_cmpxchg(l, o, n) \
- (cmpxchg_local(&((l)->a.counter), (o), (n)))
+static inline long local_cmpxchg(local_t *l, long old, long new)
+{
+ return cmpxchg_local(&l->a.counter, old, new);
+}
+
+static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
+{
+ typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
+ return try_cmpxchg_local(&l->a.counter, __old, new);
+}
+
/* Always has a lock prefix */
#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6e986088817d..9646ed6e8c0b 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -88,6 +88,9 @@
#define MCI_MISC_ADDR_MEM 3 /* memory address */
#define MCI_MISC_ADDR_GENERIC 7 /* generic */
+/* MCi_ADDR register defines */
+#define MCI_ADDR_PHYSADDR GENMASK_ULL(boot_cpu_data.x86_phys_bits - 1, 0)
+
/* CTL2 register defines */
#define MCI_CTL2_CMCI_EN BIT_ULL(30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 72ca90552b6a..b7126701574c 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -56,6 +56,7 @@ void __init sev_es_init_vc_handling(void);
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
+#define sev_status 0ULL
static inline void __init sme_early_encrypt(resource_size_t paddr,
unsigned long size) { }
diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h
index 9ca760e430b9..113b2fa51849 100644
--- a/arch/x86/include/asm/memtype.h
+++ b/arch/x86/include/asm/memtype.h
@@ -6,9 +6,8 @@
#include <asm/pgtable_types.h>
extern bool pat_enabled(void);
-extern void pat_disable(const char *reason);
-extern void pat_init(void);
-extern void init_cache_modes(void);
+extern void pat_bp_init(void);
+extern void pat_cpu_init(void);
extern int memtype_reserve(u64 start, u64 end,
enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 74ecc2bd6cd0..320566a0443d 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -33,8 +33,7 @@ enum ucode_state {
};
struct microcode_ops {
- enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
- bool refresh_fw);
+ enum ucode_state (*request_microcode_fw) (int cpu, struct device *);
void (*microcode_fini_cpu) (int cpu);
@@ -50,7 +49,6 @@ struct microcode_ops {
struct ucode_cpu_info {
struct cpu_signature cpu_sig;
- int valid;
void *mc;
};
extern struct ucode_cpu_info ucode_cpu_info[];
@@ -127,13 +125,13 @@ static inline unsigned int x86_cpuid_family(void)
#ifdef CONFIG_MICROCODE
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
-void reload_early_microcode(void);
+void reload_early_microcode(unsigned int cpu);
extern bool initrd_gone;
void microcode_bsp_resume(void);
#else
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
-static inline void reload_early_microcode(void) { }
+static inline void reload_early_microcode(unsigned int cpu) { }
static inline void microcode_bsp_resume(void) { }
#endif
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index ac31f9140d07..e6662adf3af4 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -47,12 +47,12 @@ struct microcode_amd {
extern void __init load_ucode_amd_bsp(unsigned int family);
extern void load_ucode_amd_ap(unsigned int family);
extern int __init save_microcode_in_initrd_amd(unsigned int family);
-void reload_ucode_amd(void);
+void reload_ucode_amd(unsigned int cpu);
#else
static inline void __init load_ucode_amd_bsp(unsigned int family) {}
static inline void load_ucode_amd_ap(unsigned int family) {}
static inline int __init
save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
-static inline void reload_ucode_amd(void) {}
+static inline void reload_ucode_amd(unsigned int cpu) {}
#endif
#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 4c92cea7e4b5..f1fa979e05bf 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -14,7 +14,8 @@ struct microcode_header_intel {
unsigned int pf;
unsigned int datasize;
unsigned int totalsize;
- unsigned int reserved[3];
+ unsigned int metasize;
+ unsigned int reserved[2];
};
struct microcode_intel {
@@ -41,6 +42,8 @@ struct extended_sigtable {
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
+#define MC_HEADER_TYPE_MICROCODE 1
+#define MC_HEADER_TYPE_IFS 2
#define get_totalsize(mc) \
(((struct microcode_intel *)mc)->hdr.datasize ? \
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..0da5c227f490 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,9 +9,13 @@
#include <linux/bits.h>
/* Uprobes on this MM assume 32-bit code */
-#define MM_CONTEXT_UPROBE_IA32 BIT(0)
+#define MM_CONTEXT_UPROBE_IA32 0
/* vsyscall page is accessible on this MM */
-#define MM_CONTEXT_HAS_VSYSCALL BIT(1)
+#define MM_CONTEXT_HAS_VSYSCALL 1
+/* Do not allow changing LAM mode */
+#define MM_CONTEXT_LOCK_LAM 2
+/* Allow LAM and SVA coexisting */
+#define MM_CONTEXT_FORCE_TAGGED_SVA 3
/*
* x86 has arch-specific MMU state beyond what lives in mm_struct.
@@ -39,7 +43,15 @@ typedef struct {
#endif
#ifdef CONFIG_X86_64
- unsigned short flags;
+ unsigned long flags;
+#endif
+
+#ifdef CONFIG_ADDRESS_MASKING
+ /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
+ unsigned long lam_cr3_mask;
+
+ /* Significant bits of the virtual address. Excludes tag bits. */
+ u64 untag_mask;
#endif
struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00..1d29dc791f5a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,16 +12,10 @@
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
+#include <asm/gsseg.h>
extern atomic64_t last_mm_ctx_id;
-#ifndef CONFIG_PARAVIRT_XXL
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
-}
-#endif /* !CONFIG_PARAVIRT_XXL */
-
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
@@ -91,6 +85,51 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
}
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return mm->context.lam_cr3_mask;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+ mm->context.untag_mask = oldmm->context.untag_mask;
+}
+
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return mm->context.untag_mask;
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+ mm->context.untag_mask = -1UL;
+}
+
+#define arch_pgtable_dma_compat arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+ return !mm_lam_cr3_mask(mm) ||
+ test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+}
+#else
+
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+}
+#endif
+
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
@@ -115,6 +154,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
+ mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
}
@@ -134,7 +174,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#define activate_mm(prev, next) \
do { \
- paravirt_activate_mm((prev), (next)); \
+ paravirt_enter_mmap(next); \
switch_mm((prev), (next), NULL); \
} while (0);
@@ -167,7 +207,8 @@ static inline void arch_dup_pkeys(struct mm_struct *oldmm,
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
- paravirt_arch_dup_mmap(oldmm, mm);
+ paravirt_enter_mmap(mm);
+ dup_lam(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
@@ -181,7 +222,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
- !(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
+ !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 61f0c206bff0..49bb4f2bd300 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,18 @@
#include <asm/paravirt.h>
#include <asm/mshyperv.h>
+/*
+ * Hyper-V always provides a single IO-APIC at this MMIO address.
+ * Ideally, the value should be looked up in ACPI tables, but it
+ * is needed for mapping the IO-APIC early in boot on Confidential
+ * VMs, before ACPI functions can be used.
+ */
+#define HV_IOAPIC_BASE_ADDRESS 0xfec00000
+
+#define HV_VTL_NORMAL 0x0
+#define HV_VTL_SECURE 0x1
+#define HV_VTL_MGMT 0x2
+
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
@@ -19,10 +31,13 @@ typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
void *data);
-#define hv_get_raw_timer() rdtsc_ordered()
-
void hyperv_vector_handler(struct pt_regs *regs);
+static inline unsigned char hv_get_nmi_reason(void)
+{
+ return 0;
+}
+
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
@@ -74,10 +89,16 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
return hv_status;
}
+/* Hypercall to the L0 hypervisor */
+static inline u64 hv_do_nested_hypercall(u64 control, void *input, void *output)
+{
+ return hv_do_hypercall(control | HV_HYPERCALL_NESTED, input, output);
+}
+
/* Fast hypercall with 8 bytes of input and no output */
-static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
{
- u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+ u64 hv_status;
#ifdef CONFIG_X86_64
{
@@ -105,10 +126,24 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
return hv_status;
}
+static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall8(control, input1);
+}
+
+static inline u64 hv_do_fast_nested_hypercall8(u16 code, u64 input1)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
+
+ return _hv_do_fast_hypercall8(control, input1);
+}
+
/* Fast hypercall with 16 bytes of input */
-static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
{
- u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+ u64 hv_status;
#ifdef CONFIG_X86_64
{
@@ -139,6 +174,20 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return hv_status;
}
+static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT;
+
+ return _hv_do_fast_hypercall16(control, input1, input2);
+}
+
+static inline u64 hv_do_fast_nested_hypercall16(u16 code, u64 input1, u64 input2)
+{
+ u64 control = (u64)code | HV_HYPERCALL_FAST_BIT | HV_HYPERCALL_NESTED;
+
+ return _hv_do_fast_hypercall16(control, input1, input2);
+}
+
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -174,54 +223,39 @@ struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
-int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
#ifdef CONFIG_AMD_MEM_ENCRYPT
void hv_ghcb_msr_write(u64 msr, u64 value);
void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
-void hv_ghcb_terminate(unsigned int set, unsigned int reason);
+void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
+void hv_vtom_init(void);
#else
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
+static inline void hv_vtom_init(void) {}
#endif
extern bool hv_isolation_type_snp(void);
static inline bool hv_is_synic_reg(unsigned int reg)
{
- if ((reg >= HV_REGISTER_SCONTROL) &&
- (reg <= HV_REGISTER_SINT15))
- return true;
- return false;
+ return (reg >= HV_REGISTER_SCONTROL) &&
+ (reg <= HV_REGISTER_SINT15);
}
-static inline u64 hv_get_register(unsigned int reg)
+static inline bool hv_is_sint_reg(unsigned int reg)
{
- u64 value;
-
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
- hv_ghcb_msr_read(reg, &value);
- else
- rdmsrl(reg, value);
- return value;
+ return (reg >= HV_REGISTER_SINT0) &&
+ (reg <= HV_REGISTER_SINT15);
}
-static inline void hv_set_register(unsigned int reg, u64 value)
-{
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
- hv_ghcb_msr_write(reg, value);
-
- /* Write proxy bit via wrmsl instruction */
- if (reg >= HV_REGISTER_SINT0 &&
- reg <= HV_REGISTER_SINT15)
- wrmsrl(reg, value | 1 << 20);
- } else {
- wrmsrl(reg, value);
- }
-}
+u64 hv_get_register(unsigned int reg);
+void hv_set_register(unsigned int reg, u64 value);
+u64 hv_get_non_nested_register(unsigned int reg);
+void hv_set_non_nested_register(unsigned int reg, u64 value);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -241,14 +275,17 @@ static inline int hyperv_flush_guest_mapping_range(u64 as,
}
static inline void hv_set_register(unsigned int reg, u64 value) { }
static inline u64 hv_get_register(unsigned int reg) { return 0; }
-static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
- bool visible)
-{
- return -1;
-}
+static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { }
+static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
#endif /* CONFIG_HYPERV */
+#ifdef CONFIG_HYPERV_VTL_MODE
+void __init hv_vtl_init_platform(void);
+#else
+static inline void __init hv_vtl_init_platform(void) {}
+#endif
+
#include <asm-generic/mshyperv.h>
#endif
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
index d71c7e8b738d..935c6d470341 100644
--- a/arch/x86/include/asm/msi.h
+++ b/arch/x86/include/asm/msi.h
@@ -62,4 +62,10 @@ typedef struct x86_msi_addr_hi {
struct msi_msg;
u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid);
+#define X86_VECTOR_MSI_FLAGS_SUPPORTED \
+ (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX | MSI_FLAG_PCI_MSIX_ALLOC_DYN)
+
+#define X86_VECTOR_MSI_FLAGS_REQUIRED \
+ (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 10ac52705892..3aedae61af4f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -4,12 +4,7 @@
#include <linux/bits.h>
-/*
- * CPU model specific register (MSR) numbers.
- *
- * Do not add new entries to this file unless the definitions are shared
- * between multiple compilation units.
- */
+/* CPU model specific register (MSR) numbers. */
/* x86-64 specific MSRs */
#define MSR_EFER 0xc0000080 /* extended feature register */
@@ -30,6 +25,7 @@
#define _EFER_SVME 12 /* Enable virtualization */
#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */
#define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */
+#define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */
#define EFER_SCE (1<<_EFER_SCE)
#define EFER_LME (1<<_EFER_LME)
@@ -38,6 +34,7 @@
#define EFER_SVME (1<<_EFER_SVME)
#define EFER_LMSLE (1<<_EFER_LMSLE)
#define EFER_FFXSR (1<<_EFER_FFXSR)
+#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
/* Intel MSRs. Some also available on other CPUs */
@@ -54,6 +51,10 @@
#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
+/* A mask for bits which the kernel toggles when controlling mitigations */
+#define SPEC_CTRL_MITIGATIONS_MASK (SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD \
+ | SPEC_CTRL_RRSBA_DIS_S)
+
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
@@ -194,6 +195,9 @@
#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
#define MSR_TURBO_RATIO_LIMIT2 0x000001af
+#define MSR_SNOOP_RSP_0 0x00001328
+#define MSR_SNOOP_RSP_1 0x00001329
+
#define MSR_LBR_SELECT 0x000001c8
#define MSR_LBR_TOS 0x000001c9
@@ -202,6 +206,8 @@
/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
#define MSR_INTEGRITY_CAPS 0x000002d9
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT 2
+#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
@@ -535,6 +541,11 @@
#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
+
+#define MSR_AMD64_DE_CFG 0xc0011029
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1
+#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT)
+
#define MSR_AMD64_BU_CFG2 0xc001102a
#define MSR_AMD64_IBSFETCHCTL 0xc0011030
#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
@@ -566,6 +577,26 @@
#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT)
+/* SNP feature bits enabled by the hypervisor */
+#define MSR_AMD64_SNP_VTOM BIT_ULL(3)
+#define MSR_AMD64_SNP_REFLECT_VC BIT_ULL(4)
+#define MSR_AMD64_SNP_RESTRICTED_INJ BIT_ULL(5)
+#define MSR_AMD64_SNP_ALT_INJ BIT_ULL(6)
+#define MSR_AMD64_SNP_DEBUG_SWAP BIT_ULL(7)
+#define MSR_AMD64_SNP_PREVENT_HOST_IBS BIT_ULL(8)
+#define MSR_AMD64_SNP_BTB_ISOLATION BIT_ULL(9)
+#define MSR_AMD64_SNP_VMPL_SSS BIT_ULL(10)
+#define MSR_AMD64_SNP_SECURE_TSC BIT_ULL(11)
+#define MSR_AMD64_SNP_VMGEXIT_PARAM BIT_ULL(12)
+#define MSR_AMD64_SNP_IBS_VIRT BIT_ULL(14)
+#define MSR_AMD64_SNP_VMSA_REG_PROTECTION BIT_ULL(16)
+#define MSR_AMD64_SNP_SMT_PROTECTION BIT_ULL(17)
+
+/* SNP feature bits reserved for future use. */
+#define MSR_AMD64_SNP_RESERVED_BIT13 BIT_ULL(13)
+#define MSR_AMD64_SNP_RESERVED_BIT15 BIT_ULL(15)
+#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, 18)
+
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
/* AMD Collaborative Processor Performance Control MSRs */
@@ -640,9 +671,6 @@
#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
-#define MSR_F10H_DECFG 0xc0011029
-#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1
-#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
/* K8 MSRs */
#define MSR_K8_TOP_MEM1 0xc001001a
@@ -796,6 +824,7 @@
#define ENERGY_PERF_BIAS_PERFORMANCE 0
#define ENERGY_PERF_BIAS_BALANCE_PERFORMANCE 4
#define ENERGY_PERF_BIAS_NORMAL 6
+#define ENERGY_PERF_BIAS_NORMAL_POWERSAVE 7
#define ENERGY_PERF_BIAS_BALANCE_POWERSAVE 8
#define ENERGY_PERF_BIAS_POWERSAVE 15
@@ -1050,6 +1079,22 @@
#define VMX_BASIC_MEM_TYPE_WB 6LLU
#define VMX_BASIC_INOUT 0x0040000000000000LLU
+/* Resctrl MSRs: */
+/* - Intel: */
+#define MSR_IA32_L3_QOS_CFG 0xc81
+#define MSR_IA32_L2_QOS_CFG 0xc82
+#define MSR_IA32_QM_EVTSEL 0xc8d
+#define MSR_IA32_QM_CTR 0xc8e
+#define MSR_IA32_PQR_ASSOC 0xc8f
+#define MSR_IA32_L3_CBM_BASE 0xc90
+#define MSR_IA32_L2_CBM_BASE 0xd10
+#define MSR_IA32_MBA_THRTL_BASE 0xd50
+
+/* - AMD: */
+#define MSR_IA32_MBA_BW_BASE 0xc0000200
+#define MSR_IA32_SMBA_BW_BASE 0xc0000280
+#define MSR_IA32_EVT_CFG_BASE 0xc0000400
+
/* MSR_IA32_VMX_MISC bits */
#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 76d726074c16..f0eeaf6e5f5f 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -25,13 +25,12 @@
#include <uapi/asm/mtrr.h>
-void mtrr_bp_init(void);
-
/*
* The following functions are for use by other drivers that cannot use
* arch_phys_wc_add and arch_phys_wc_del.
*/
# ifdef CONFIG_MTRR
+void mtrr_bp_init(void);
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
@@ -42,12 +41,12 @@ extern int mtrr_add_page(unsigned long base, unsigned long size,
extern int mtrr_del(int reg, unsigned long base, unsigned long size);
extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-extern void mtrr_ap_init(void);
-extern void set_mtrr_aps_delayed_init(void);
-extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
+void mtrr_disable(void);
+void mtrr_enable(void);
+void mtrr_generic_set_state(void);
# else
static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
{
@@ -83,10 +82,11 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-#define mtrr_ap_init() do {} while (0)
-#define set_mtrr_aps_delayed_init() do {} while (0)
-#define mtrr_aps_init() do {} while (0)
+#define mtrr_bp_init() do {} while (0)
#define mtrr_bp_restore() do {} while (0)
+#define mtrr_disable() do {} while (0)
+#define mtrr_enable() do {} while (0)
+#define mtrr_generic_set_state() do {} while (0)
# endif
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 3a8fdf881313..778df05f8539 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -26,7 +26,7 @@
#define TPAUSE_C01_STATE 1
#define TPAUSE_C02_STATE 0
-static inline void __monitor(const void *eax, unsigned long ecx,
+static __always_inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
@@ -34,7 +34,7 @@ static inline void __monitor(const void *eax, unsigned long ecx,
:: "a" (eax), "c" (ecx), "d"(edx));
}
-static inline void __monitorx(const void *eax, unsigned long ecx,
+static __always_inline void __monitorx(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitorx %eax, %ecx, %edx;" */
@@ -42,7 +42,7 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
:: "a" (eax), "c" (ecx), "d"(edx));
}
-static inline void __mwait(unsigned long eax, unsigned long ecx)
+static __always_inline void __mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
@@ -77,8 +77,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
* EAX (logical) address to monitor
* ECX #GP if not zero
*/
-static inline void __mwaitx(unsigned long eax, unsigned long ebx,
- unsigned long ecx)
+static __always_inline void __mwaitx(unsigned long eax, unsigned long ebx,
+ unsigned long ecx)
{
/* No MDS buffer clear as this is AMD/HYGON only */
@@ -87,7 +87,7 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
:: "a" (eax), "b" (ebx), "c" (ecx));
}
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+static __always_inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
@@ -105,7 +105,7 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
-static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+static __always_inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index c936ce9f0c47..edb2b0cb8efe 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -12,8 +12,104 @@
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
#include <asm/percpu.h>
+#include <asm/current.h>
-#define RETPOLINE_THUNK_SIZE 32
+/*
+ * Call depth tracking for Intel SKL CPUs to address the RSB underflow
+ * issue in software.
+ *
+ * The tracking does not use a counter. It uses uses arithmetic shift
+ * right on call entry and logical shift left on return.
+ *
+ * The depth tracking variable is initialized to 0x8000.... when the call
+ * depth is zero. The arithmetic shift right sign extends the MSB and
+ * saturates after the 12th call. The shift count is 5 for both directions
+ * so the tracking covers 12 nested calls.
+ *
+ * Call
+ * 0: 0x8000000000000000 0x0000000000000000
+ * 1: 0xfc00000000000000 0xf000000000000000
+ * ...
+ * 11: 0xfffffffffffffff8 0xfffffffffffffc00
+ * 12: 0xffffffffffffffff 0xffffffffffffffe0
+ *
+ * After a return buffer fill the depth is credited 12 calls before the
+ * next stuffing has to take place.
+ *
+ * There is a inaccuracy for situations like this:
+ *
+ * 10 calls
+ * 5 returns
+ * 3 calls
+ * 4 returns
+ * 3 calls
+ * ....
+ *
+ * The shift count might cause this to be off by one in either direction,
+ * but there is still a cushion vs. the RSB depth. The algorithm does not
+ * claim to be perfect and it can be speculated around by the CPU, but it
+ * is considered that it obfuscates the problem enough to make exploitation
+ * extremly difficult.
+ */
+#define RET_DEPTH_SHIFT 5
+#define RSB_RET_STUFF_LOOPS 16
+#define RET_DEPTH_INIT 0x8000000000000000ULL
+#define RET_DEPTH_INIT_FROM_CALL 0xfc00000000000000ULL
+#define RET_DEPTH_CREDIT 0xffffffffffffffffULL
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+# define CALL_THUNKS_DEBUG_INC_CALLS \
+ incq %gs:__x86_call_count;
+# define CALL_THUNKS_DEBUG_INC_RETS \
+ incq %gs:__x86_ret_count;
+# define CALL_THUNKS_DEBUG_INC_STUFFS \
+ incq %gs:__x86_stuffs_count;
+# define CALL_THUNKS_DEBUG_INC_CTXSW \
+ incq %gs:__x86_ctxsw_count;
+#else
+# define CALL_THUNKS_DEBUG_INC_CALLS
+# define CALL_THUNKS_DEBUG_INC_RETS
+# define CALL_THUNKS_DEBUG_INC_STUFFS
+# define CALL_THUNKS_DEBUG_INC_CTXSW
+#endif
+
+#if defined(CONFIG_CALL_DEPTH_TRACKING) && !defined(COMPILE_OFFSETS)
+
+#include <asm/asm-offsets.h>
+
+#define CREDIT_CALL_DEPTH \
+ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define ASM_CREDIT_CALL_DEPTH \
+ movq $-1, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define RESET_CALL_DEPTH \
+ mov $0x80, %rax; \
+ shl $56, %rax; \
+ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth);
+
+#define RESET_CALL_DEPTH_FROM_CALL \
+ mov $0xfc, %rax; \
+ shl $56, %rax; \
+ movq %rax, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#define INCREMENT_CALL_DEPTH \
+ sarq $5, %gs:pcpu_hot + X86_call_depth; \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#define ASM_INCREMENT_CALL_DEPTH \
+ sarq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth); \
+ CALL_THUNKS_DEBUG_INC_CALLS
+
+#else
+#define CREDIT_CALL_DEPTH
+#define ASM_CREDIT_CALL_DEPTH
+#define RESET_CALL_DEPTH
+#define INCREMENT_CALL_DEPTH
+#define ASM_INCREMENT_CALL_DEPTH
+#define RESET_CALL_DEPTH_FROM_CALL
+#endif
/*
* Fill the CPU return stack buffer.
@@ -32,6 +128,7 @@
* from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
*/
+#define RETPOLINE_THUNK_SIZE 32
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
/*
@@ -60,7 +157,9 @@
dec reg; \
jnz 771b; \
/* barrier for jnz misprediction */ \
- lfence;
+ lfence; \
+ ASM_CREDIT_CALL_DEPTH \
+ CALL_THUNKS_DEBUG_INC_CTXSW
#else
/*
* i386 doesn't unconditionally have LFENCE, as such it can't
@@ -95,9 +194,9 @@
* builds.
*/
.macro ANNOTATE_RETPOLINE_SAFE
- .Lannotate_\@:
+.Lhere_\@:
.pushsection .discard.retpoline_safe
- _ASM_PTR .Lannotate_\@
+ .long .Lhere_\@ - .
.popsection
.endm
@@ -111,8 +210,8 @@
* Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
* eventually turn into it's own annotation.
*/
-.macro ANNOTATE_UNRET_END
-#ifdef CONFIG_DEBUG_ENTRY
+.macro VALIDATE_UNRET_END
+#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
ANNOTATE_RETPOLINE_SAFE
nop
#endif
@@ -162,7 +261,7 @@
.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2=ALT_NOT(X86_FEATURE_ALWAYS)
ALTERNATIVE_2 "jmp .Lskip_rsb_\@", \
__stringify(__FILL_RETURN_BUFFER(\reg,\nr)), \ftr, \
- __stringify(__FILL_ONE_RETURN), \ftr2
+ __stringify(nop;nop;__FILL_ONE_RETURN), \ftr2
.Lskip_rsb_\@:
.endm
@@ -185,11 +284,32 @@
* where we have a stack but before any RET instruction.
*/
.macro UNTRAIN_RET
-#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY)
- ANNOTATE_UNRET_END
- ALTERNATIVE_2 "", \
- CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
- "call entry_ibpb", X86_FEATURE_ENTRY_IBPB
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING)
+ VALIDATE_UNRET_END
+ ALTERNATIVE_3 "", \
+ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
+ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+.macro UNTRAIN_RET_FROM_CALL
+#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
+ defined(CONFIG_CALL_DEPTH_TRACKING)
+ VALIDATE_UNRET_END
+ ALTERNATIVE_3 "", \
+ CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
+ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
+ __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH
+#endif
+.endm
+
+
+.macro CALL_DEPTH_ACCOUNT
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ ALTERNATIVE "", \
+ __stringify(ASM_INCREMENT_CALL_DEPTH), X86_FEATURE_CALL_DEPTH
#endif
.endm
@@ -198,16 +318,50 @@
#define ANNOTATE_RETPOLINE_SAFE \
"999:\n\t" \
".pushsection .discard.retpoline_safe\n\t" \
- _ASM_PTR " 999b\n\t" \
+ ".long 999b - .\n\t" \
".popsection\n\t"
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_call_thunk_array[];
+extern retpoline_thunk_t __x86_indirect_jump_thunk_array[];
extern void __x86_return_thunk(void);
extern void zen_untrain_ret(void);
extern void entry_ibpb(void);
+#ifdef CONFIG_CALL_THUNKS
+extern void (*x86_return_thunk)(void);
+#else
+#define x86_return_thunk (&__x86_return_thunk)
+#endif
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+extern void __x86_return_skl(void);
+
+static inline void x86_set_skl_return_thunk(void)
+{
+ x86_return_thunk = &__x86_return_skl;
+}
+
+#define CALL_DEPTH_ACCOUNT \
+ ALTERNATIVE("", \
+ __stringify(INCREMENT_CALL_DEPTH), \
+ X86_FEATURE_CALL_DEPTH)
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DECLARE_PER_CPU(u64, __x86_call_count);
+DECLARE_PER_CPU(u64, __x86_ret_count);
+DECLARE_PER_CPU(u64, __x86_stuffs_count);
+DECLARE_PER_CPU(u64, __x86_ctxsw_count);
+#endif
+#else
+static inline void x86_set_skl_return_thunk(void) {}
+
+#define CALL_DEPTH_ACCOUNT ""
+
+#endif
+
#ifdef CONFIG_RETPOLINE
#define GEN(reg) \
@@ -215,6 +369,16 @@ extern void entry_ibpb(void);
#include <asm/GEN-for-each-reg.h>
#undef GEN
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_call_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#define GEN(reg) \
+ extern retpoline_thunk_t __x86_indirect_jump_thunk_ ## reg;
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
#ifdef CONFIG_X86_64
/*
@@ -321,7 +485,7 @@ static inline void indirect_branch_prediction_barrier(void)
/* The Intel SPEC CTRL MSR base value cache */
extern u64 x86_spec_ctrl_base;
DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
-extern void write_spec_ctrl_current(u64 val, bool force);
+extern void update_spec_ctrl_cond(u64 val);
extern u64 spec_ctrl_current(void);
/*
@@ -400,7 +564,7 @@ static __always_inline void mds_user_clear_cpu_buffers(void)
*
* Clear CPU buffers if the corresponding static key is enabled
*/
-static inline void mds_idle_clear_cpu_buffers(void)
+static __always_inline void mds_idle_clear_cpu_buffers(void)
{
if (static_branch_likely(&mds_idle_clear))
mds_clear_cpu_buffers();
diff --git a/arch/x86/include/asm/orc_header.h b/arch/x86/include/asm/orc_header.h
new file mode 100644
index 000000000000..07bacf3e160e
--- /dev/null
+++ b/arch/x86/include/asm/orc_header.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) Meta Platforms, Inc. and affiliates. */
+
+#ifndef _ORC_HEADER_H
+#define _ORC_HEADER_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/orc_hash.h>
+
+/*
+ * The header is currently a 20-byte hash of the ORC entry definition; see
+ * scripts/orc_hash.sh.
+ */
+#define ORC_HEADER \
+ __used __section(".orc_header") __aligned(4) \
+ static const u8 orc_header[] = { ORC_HASH }
+
+#endif /* _ORC_HEADER_H */
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 5a2baf28a1dc..46d7e06763c9 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -39,6 +39,12 @@
#define ORC_REG_SP_INDIRECT 9
#define ORC_REG_MAX 15
+#define ORC_TYPE_UNDEFINED 0
+#define ORC_TYPE_END_OF_STACK 1
+#define ORC_TYPE_CALL 2
+#define ORC_TYPE_REGS 3
+#define ORC_TYPE_REGS_PARTIAL 4
+
#ifndef __ASSEMBLY__
#include <asm/byteorder.h>
@@ -56,14 +62,14 @@ struct orc_entry {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
- unsigned type:2;
- unsigned end:1;
+ unsigned type:3;
+ unsigned signal:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned bp_reg:4;
unsigned sp_reg:4;
- unsigned unused:5;
- unsigned end:1;
- unsigned type:2;
+ unsigned unused:4;
+ unsigned signal:1;
+ unsigned type:3;
#endif
} __packed;
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 9cc82f305f4b..d18e5c332cb9 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -34,9 +34,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
copy_page(to, from);
}
-#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
- alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
-#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
+#define vma_alloc_zeroed_movable_folio(vma, vaddr) \
+ vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
#ifndef __pa
#define __pa(x) __phys_addr((unsigned long)(x))
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
index df42f8aa99e4..580d71aca65a 100644
--- a/arch/x86/include/asm/page_32.h
+++ b/arch/x86/include/asm/page_32.h
@@ -15,10 +15,6 @@ extern unsigned long __phys_addr(unsigned long);
#define __phys_addr_symbol(x) __phys_addr(x)
#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_mapnr)
-#endif /* CONFIG_FLATMEM */
-
#include <linux/string.h>
static inline void clear_page(void *page)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 198e03e59ca1..cc6b8e087192 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,10 +39,6 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define __phys_reloc_hide(x) (x)
-#ifdef CONFIG_FLATMEM
-#define pfn_valid(pfn) ((pfn) < max_pfn)
-#endif
-
void clear_page_orig(void *page);
void clear_page_rep(void *page);
void clear_page_erms(void *page);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index e9e2c3ba5923..06ef25411d62 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -49,7 +49,7 @@
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
-/* See Documentation/x86/x86_64/mm.rst for a description of the memory map. */
+/* See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map. */
#define __PHYSICAL_MASK_SHIFT 52
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a506a411474d..86bd4311daf8 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -11,20 +11,14 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
-#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
-#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
-
-#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
-#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
-
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
-/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
+/* Cast P*D_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
-#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
-#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK)
+#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2a0b8dd4ec33..b49778664d2b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -4,13 +4,13 @@
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
+#include <asm/paravirt_types.h>
+
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
#include <asm/nospec-branch.h>
-#include <asm/paravirt_types.h>
-
#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>
@@ -26,7 +26,7 @@ DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
void paravirt_set_sched_clock(u64 (*func)(void));
-static inline u64 paravirt_sched_clock(void)
+static __always_inline u64 paravirt_sched_clock(void)
{
return static_call(pv_sched_clock)();
}
@@ -168,7 +168,7 @@ static inline void __write_cr4(unsigned long x)
PVOP_VCALL1(cpu.write_cr4, x);
}
-static inline void arch_safe_halt(void)
+static __always_inline void arch_safe_halt(void)
{
PVOP_VCALL0(irq.safe_halt);
}
@@ -178,7 +178,9 @@ static inline void halt(void)
PVOP_VCALL0(irq.halt);
}
-static inline void wbinvd(void)
+extern noinstr void pv_native_wbinvd(void);
+
+static __always_inline void wbinvd(void)
{
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
}
@@ -332,16 +334,9 @@ static inline void tss_update_io_bitmap(void)
}
#endif
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
+static inline void paravirt_enter_mmap(struct mm_struct *next)
{
- PVOP_VCALL2(mmu.activate_mm, prev, next);
-}
-
-static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
- PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
+ PVOP_VCALL1(mmu.enter_mmap, next);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -665,6 +660,7 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
asm(".pushsection " section ", \"ax\";" \
".globl " PV_THUNK_NAME(func) ";" \
".type " PV_THUNK_NAME(func) ", @function;" \
+ ASM_FUNC_ALIGN \
PV_THUNK_NAME(func) ":" \
ASM_ENDBR \
FRAME_BEGIN \
@@ -730,6 +726,18 @@ static __always_inline unsigned long arch_local_irq_save(void)
#undef PVOP_VCALL4
#undef PVOP_CALL4
+#define DEFINE_PARAVIRT_ASM(func, instr, sec) \
+ asm (".pushsection " #sec ", \"ax\"\n" \
+ ".global " #func "\n\t" \
+ ".type " #func ", @function\n\t" \
+ ASM_FUNC_ALIGN "\n" \
+ #func ":\n\t" \
+ ASM_ENDBR \
+ instr "\n\t" \
+ ASM_RET \
+ ".size " #func ", . - " #func "\n\t" \
+ ".popsection")
+
extern void default_banner(void);
#else /* __ASSEMBLY__ */
@@ -774,8 +782,7 @@ extern void default_banner(void);
#ifndef __ASSEMBLY__
#ifndef CONFIG_PARAVIRT_XXL
-static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline void paravirt_enter_mmap(struct mm_struct *mm)
{
}
#endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f3d601574730..4acbcddddc29 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -2,36 +2,23 @@
#ifndef _ASM_X86_PARAVIRT_TYPES_H
#define _ASM_X86_PARAVIRT_TYPES_H
-/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_EAX (1 << 0)
-#define CLBR_ECX (1 << 1)
-#define CLBR_EDX (1 << 2)
-#define CLBR_EDI (1 << 3)
-
-#ifdef CONFIG_X86_32
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY ((1 << 4) - 1)
-
-#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
-#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#else
-#define CLBR_RAX CLBR_EAX
-#define CLBR_RCX CLBR_ECX
-#define CLBR_RDX CLBR_EDX
-#define CLBR_RDI CLBR_EDI
-#define CLBR_RSI (1 << 4)
-#define CLBR_R8 (1 << 5)
-#define CLBR_R9 (1 << 6)
-#define CLBR_R10 (1 << 7)
-#define CLBR_R11 (1 << 8)
-
-#define CLBR_ANY ((1 << 9) - 1)
+#ifndef __ASSEMBLY__
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch_site {
+ u8 *instr; /* original instructions */
+ u8 type; /* type of this instruction */
+ u8 len; /* length of original instruction */
+};
-#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
- CLBR_RCX | CLBR_R8 | CLBR_R9)
-#define CLBR_RET_REG (CLBR_RAX)
+/* Lazy mode for batching updates / context switch */
+enum paravirt_lazy_mode {
+ PARAVIRT_LAZY_NONE,
+ PARAVIRT_LAZY_MMU,
+ PARAVIRT_LAZY_CPU,
+};
+#endif
-#endif /* X86_64 */
+#ifdef CONFIG_PARAVIRT
#ifndef __ASSEMBLY__
@@ -177,11 +164,8 @@ struct pv_mmu_ops {
unsigned long (*read_cr3)(void);
void (*write_cr3)(unsigned long);
- /* Hooks for intercepting the creation/use of an mm_struct. */
- void (*activate_mm)(struct mm_struct *prev,
- struct mm_struct *next);
- void (*dup_mmap)(struct mm_struct *oldmm,
- struct mm_struct *mm);
+ /* Hook for intercepting the creation/use of an mm_struct. */
+ void (*enter_mmap)(struct mm_struct *mm);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
@@ -279,27 +263,23 @@ extern struct paravirt_patch_template pv_ops;
#define paravirt_type(op) \
[paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
[paravirt_opptr] "m" (pv_ops.op)
-#define paravirt_clobber(clobber) \
- [paravirt_clobber] "i" (clobber)
-
/*
* Generate some code, and mark it as patchable by the
* apply_paravirt() alternate instruction patcher.
*/
-#define _paravirt_alt(insn_string, type, clobber) \
+#define _paravirt_alt(insn_string, type) \
"771:\n\t" insn_string "\n" "772:\n" \
".pushsection .parainstructions,\"a\"\n" \
_ASM_ALIGN "\n" \
_ASM_PTR " 771b\n" \
" .byte " type "\n" \
" .byte 772b-771b\n" \
- " .short " clobber "\n" \
_ASM_ALIGN "\n" \
".popsection\n"
/* Generate patchable code, with the default asm parameters. */
#define paravirt_alt(insn_string) \
- _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
+ _paravirt_alt(insn_string, "%c[paravirt_typenum]")
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -451,20 +431,19 @@ int paravirt_disable_iospace(void);
})
-#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \
+#define ____PVOP_CALL(ret, op, call_clbr, extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
asm volatile(paravirt_alt(PARAVIRT_CALL) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
- paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
})
-#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, call_clbr, \
extra_clbr, ...) \
({ \
PVOP_CALL_ARGS; \
@@ -473,45 +452,44 @@ int paravirt_disable_iospace(void);
alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
- paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
ret; \
})
#define __PVOP_CALL(rettype, op, ...) \
- ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, \
PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
- ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, \
PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
##__VA_ARGS__)
#define __PVOP_CALLEESAVE(rettype, op, ...) \
- ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, \
PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
- CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_VCALL(op, ...) \
- (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ (void)____PVOP_CALL(, op, PVOP_VCALL_CLOBBERS, \
VEXTRA_CLOBBERS, ##__VA_ARGS__)
#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
- (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, \
PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
##__VA_ARGS__)
#define __PVOP_VCALLEESAVE(op, ...) \
- (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \
+ (void)____PVOP_CALL(, op.func, \
PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
- (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, \
PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
@@ -571,13 +549,6 @@ int paravirt_disable_iospace(void);
__PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-/* Lazy mode for batching updates / context switch */
-enum paravirt_lazy_mode {
- PARAVIRT_LAZY_NONE,
- PARAVIRT_LAZY_MMU,
- PARAVIRT_LAZY_CPU,
-};
-
enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
void paravirt_start_context_switch(struct task_struct *prev);
void paravirt_end_context_switch(struct task_struct *next);
@@ -588,21 +559,20 @@ void paravirt_flush_lazy_mmu(void);
void _paravirt_nop(void);
void paravirt_BUG(void);
-u64 _paravirt_ident_64(u64);
unsigned long paravirt_ret0(void);
+#ifdef CONFIG_PARAVIRT_XXL
+u64 _paravirt_ident_64(u64);
+unsigned long pv_native_save_fl(void);
+void pv_native_irq_disable(void);
+void pv_native_irq_enable(void);
+unsigned long pv_native_read_cr2(void);
+#endif
#define paravirt_nop ((void *)_paravirt_nop)
-/* These all sit in the .parainstructions section to tell us what to patch. */
-struct paravirt_patch_site {
- u8 *instr; /* original instructions */
- u8 type; /* type of this instruction */
- u8 len; /* length of original instruction */
-};
-
extern struct paravirt_patch_site __parainstructions[],
__parainstructions_end[];
#endif /* __ASSEMBLY__ */
-
+#endif /* CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 736793d65bcb..b40c462b4af3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -21,7 +21,7 @@ struct pci_sysdata {
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+#ifdef CONFIG_PCI_MSI
void *fwnode; /* IRQ domain for MSI assignment */
#endif
#if IS_ENABLED(CONFIG_VMD)
@@ -52,7 +52,7 @@ static inline int pci_proc_domain(struct pci_bus *bus)
}
#endif
-#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+#ifdef CONFIG_PCI_MSI
static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
{
return to_pci_sysdata(bus)->fwnode;
@@ -92,6 +92,7 @@ void pcibios_scan_root(int bus);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev);
#define HAVE_PCI_MMAP
#define arch_can_pci_mmap_wc() pat_enabled()
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 9ac46dbe57d4..abf09882f58b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -121,6 +121,9 @@
#define PEBS_DATACFG_LBRS BIT_ULL(3)
#define PEBS_DATACFG_LBR_SHIFT 24
+/* Steal the highest bit of pebs_data_cfg for SW usage */
+#define PEBS_UPDATE_DS_SW BIT_ULL(63)
+
/*
* Intel "Architectural Performance Monitoring" CPUID
* detection/enumeration details:
@@ -160,6 +163,14 @@ union cpuid10_edx {
};
/*
+ * Intel "Architectural Performance Monitoring extension" CPUID
+ * detection/enumeration details:
+ */
+#define ARCH_PERFMON_EXT_LEAF 0x00000023
+#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1
+#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
+
+/*
* Intel Architectural LBR CPUID detection/enumeration details:
*/
union cpuid28_eax {
@@ -543,12 +554,12 @@ static inline void perf_check_microcode(void) { }
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
-extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
+extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
#else
struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data);
-static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+static inline void x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
{
- return -1;
+ memset(lbr, 0, sizeof(*lbr));
}
#endif
@@ -578,7 +589,7 @@ extern void perf_amd_brs_lopwr_cb(bool lopwr_in);
DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
-static inline void perf_lopwr_cb(bool lopwr_in)
+static __always_inline void perf_lopwr_cb(bool lopwr_in)
{
static_call_mod(perf_lopwr_cb)(lopwr_in);
}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 60d0f9015317..e9482a11ac52 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -80,21 +80,37 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
return ((value >> rightshift) & mask) << leftshift;
}
-/* Encode and de-code a swap entry */
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * <----------------- offset ------------------> 0 E <- type --> 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
+#define _SWP_TYPE_SHIFT (_PAGE_BIT_PRESENT + 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
+#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
-#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
- & ((1U << SWP_TYPE_BITS) - 1))
+#define __swp_type(x) (((x).val >> _SWP_TYPE_SHIFT) \
+ & _SWP_TYPE_MASK)
#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT)
#define __swp_entry(type, offset) ((swp_entry_t) { \
- ((type) << (_PAGE_BIT_PRESENT + 1)) \
+ (((type) & _SWP_TYPE_MASK) << _SWP_TYPE_SHIFT) \
| ((offset) << SWP_OFFSET_SHIFT) })
#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
/* No inverted PFNs on 2 level page tables */
static inline u64 protnone_mask(u64 val)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 28421a887209..9e7c0b719c3c 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -2,8 +2,6 @@
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H
-#include <asm/atomic64_32.h>
-
/*
* Intel Physical Address Extension (PAE) Mode - three-level page
* tables on PPro+ CPUs.
@@ -21,7 +19,15 @@
pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
-/* Rules for using set_pte: the pte being assigned *must* be
+#define pxx_xchg64(_pxx, _ptr, _val) ({ \
+ _pxx##val_t *_p = (_pxx##val_t *)_ptr; \
+ _pxx##val_t _o = *_p; \
+ do { } while (!try_cmpxchg64(_p, &_o, (_val))); \
+ native_make_##_pxx(_o); \
+})
+
+/*
+ * Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
* not possible, use pte_get_and_clear to obtain the old pte
@@ -29,75 +35,19 @@
*/
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- ptep->pte_high = pte.pte_high;
+ WRITE_ONCE(ptep->pte_high, pte.pte_high);
smp_wmb();
- ptep->pte_low = pte.pte_low;
-}
-
-#define pmd_read_atomic pmd_read_atomic
-/*
- * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
- * a "*pmdp" dereference done by GCC. Problem is, in certain places
- * where pte_offset_map_lock() is called, concurrent page faults are
- * allowed, if the mmap_lock is hold for reading. An example is mincore
- * vs page faults vs MADV_DONTNEED. On the page fault side
- * pmd_populate() rightfully does a set_64bit(), but if we're reading the
- * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
- * because GCC will not read the 64-bit value of the pmd atomically.
- *
- * To fix this all places running pte_offset_map_lock() while holding the
- * mmap_lock in read mode, shall read the pmdp pointer using this
- * function to know if the pmd is null or not, and in turn to know if
- * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
- * operations.
- *
- * Without THP if the mmap_lock is held for reading, the pmd can only
- * transition from null to not null while pmd_read_atomic() runs. So
- * we can always return atomic pmd values with this function.
- *
- * With THP if the mmap_lock is held for reading, the pmd can become
- * trans_huge or none or point to a pte (and in turn become "stable")
- * at any time under pmd_read_atomic(). We could read it truly
- * atomically here with an atomic64_read() for the THP enabled case (and
- * it would be a whole lot simpler), but to avoid using cmpxchg8b we
- * only return an atomic pmdval if the low part of the pmdval is later
- * found to be stable (i.e. pointing to a pte). We are also returning a
- * 'none' (zero) pmdval if the low part of the pmd is zero.
- *
- * In some cases the high and low part of the pmdval returned may not be
- * consistent if THP is enabled (the low part may point to previously
- * mapped hugepage, while the high part may point to a more recently
- * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
- * needs the low part of the pmd to be read atomically to decide if the
- * pmd is unstable or not, with the only exception when the low part
- * of the pmd is zero, in which case we return a 'none' pmd.
- */
-static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
-{
- pmdval_t ret;
- u32 *tmp = (u32 *)pmdp;
-
- ret = (pmdval_t) (*tmp);
- if (ret) {
- /*
- * If the low part is null, we must not read the high part
- * or we can end up with a partial pmd.
- */
- smp_rmb();
- ret |= ((pmdval_t)*(tmp + 1)) << 32;
- }
-
- return (pmd_t) { ret };
+ WRITE_ONCE(ptep->pte_low, pte.pte_low);
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+ pxx_xchg64(pte, ptep, native_pte_val(pte));
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
+ pxx_xchg64(pmd, pmdp, native_pmd_val(pmd));
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -105,7 +55,7 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
- set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
+ pxx_xchg64(pud, pudp, native_pud_val(pud));
}
/*
@@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- ptep->pte_low = 0;
+ WRITE_ONCE(ptep->pte_low, 0);
smp_wmb();
- ptep->pte_high = 0;
+ WRITE_ONCE(ptep->pte_high, 0);
}
-static inline void native_pmd_clear(pmd_t *pmd)
+static inline void native_pmd_clear(pmd_t *pmdp)
{
- u32 *tmp = (u32 *)pmd;
- *tmp = 0;
+ WRITE_ONCE(pmdp->pmd_low, 0);
smp_wmb();
- *(tmp + 1) = 0;
+ WRITE_ONCE(pmdp->pmd_high, 0);
}
static inline void native_pud_clear(pud_t *pudp)
@@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp)
*/
}
+
#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
- pte_t res;
-
- res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);
-
- return res;
+ return pxx_xchg64(pte, ptep, 0ULL);
}
-#else
-#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
-#endif
-union split_pmd {
- struct {
- u32 pmd_low;
- u32 pmd_high;
- };
- pmd_t pmd;
-};
-
-#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
- union split_pmd res, *orig = (union split_pmd *)pmdp;
-
- /* xchg acts as a barrier before setting of the high bits */
- res.pmd_low = xchg(&orig->pmd_low, 0);
- res.pmd_high = orig->pmd_high;
- orig->pmd_high = 0;
+ return pxx_xchg64(pmd, pmdp, 0ULL);
+}
- return res.pmd;
+static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
+{
+ return pxx_xchg64(pud, pudp, 0ULL);
}
#else
+#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
#ifndef pmdp_establish
@@ -199,57 +133,36 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
* anybody.
*/
if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
- union split_pmd old, new, *ptr;
-
- ptr = (union split_pmd *)pmdp;
-
- new.pmd = pmd;
-
/* xchg acts as a barrier before setting of the high bits */
- old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
- old.pmd_high = ptr->pmd_high;
- ptr->pmd_high = new.pmd_high;
- return old.pmd;
- }
-
- do {
- old = *pmdp;
- } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
-
- return old;
-}
-#endif
-
-#ifdef CONFIG_SMP
-union split_pud {
- struct {
- u32 pud_low;
- u32 pud_high;
- };
- pud_t pud;
-};
-
-static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
-{
- union split_pud res, *orig = (union split_pud *)pudp;
+ old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
+ old.pmd_high = READ_ONCE(pmdp->pmd_high);
+ WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
- pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
-#endif
-
- /* xchg acts as a barrier before setting of the high bits */
- res.pud_low = xchg(&orig->pud_low, 0);
- res.pud_high = orig->pud_high;
- orig->pud_high = 0;
+ return old;
+ }
- return res.pud;
+ return pxx_xchg64(pmd, pmdp, pmd.pmd);
}
-#else
-#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
-/* Encode and de-code a swap entry */
+/*
+ * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
+ * are !pte_none() && !pte_present().
+ *
+ * Format of swap PTEs:
+ *
+ * 6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3
+ * 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2
+ * < type -> <---------------------- offset ----------------------
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * --------------------------------------------> 0 E 0 0 0 0 0 0 0
+ *
+ * E is the exclusive marker that is not stored in swap entries.
+ */
#define SWP_TYPE_BITS 5
+#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)
#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
@@ -257,9 +170,10 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
-#define __swp_type(x) (((x).val) & ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_type(x) (((x).val) & _SWP_TYPE_MASK)
#define __swp_offset(x) ((x).val >> SWP_TYPE_BITS)
-#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << SWP_TYPE_BITS})
+#define __swp_entry(type, offset) ((swp_entry_t){((type) & _SWP_TYPE_MASK) \
+ | (offset) << SWP_TYPE_BITS})
/*
* Normally, __swp_entry() converts from arch-independent swp_entry_t to
@@ -287,6 +201,9 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
__pteval_swp_offset(pte)))
+/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
+#define _PAGE_SWP_EXCLUSIVE _PAGE_PSE
+
#include <asm/pgtable-invert.h>
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 56baf43befb4..80911349519e 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -18,6 +18,13 @@ typedef union {
};
pteval_t pte;
} pte_t;
+
+typedef union {
+ struct {
+ unsigned long pmd_low, pmd_high;
+ };
+ pmdval_t pmd;
+} pmd_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5059799bebe3..15ae4d6ba476 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -139,6 +139,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_DIRTY;
}
+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
@@ -288,15 +289,36 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+static inline pte_t pte_wrprotect(pte_t pte)
+{
+ return pte_clear_flags(pte, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pte_uffd_wp(pte_t pte)
{
- return pte_flags(pte) & _PAGE_UFFD_WP;
+ bool wp = pte_flags(pte) & _PAGE_UFFD_WP;
+
+#ifdef CONFIG_DEBUG_VM
+ /*
+ * Having write bit for wr-protect-marked present ptes is fatal,
+ * because it means the uffd-wp bit will be ignored and write will
+ * just go through.
+ *
+ * Use any chance of pgtable walking to verify this (e.g., when
+ * page swapped out or being migrated for all purposes). It means
+ * something is already wrong. Tell the admin even before the
+ * process crashes. We also nail it with wrong pgtable setup.
+ */
+ WARN_ON_ONCE(wp && pte_write(pte));
+#endif
+
+ return wp;
}
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_UFFD_WP);
+ return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
}
static inline pte_t pte_clear_uffd_wp(pte_t pte)
@@ -315,11 +337,6 @@ static inline pte_t pte_mkold(pte_t pte)
return pte_clear_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_wrprotect(pte_t pte)
-{
- return pte_clear_flags(pte, _PAGE_RW);
-}
-
static inline pte_t pte_mkexec(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_NX);
@@ -384,6 +401,11 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
static inline int pmd_uffd_wp(pmd_t pmd)
{
@@ -392,7 +414,7 @@ static inline int pmd_uffd_wp(pmd_t pmd)
static inline pmd_t pmd_mkuffd_wp(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_UFFD_WP);
+ return pmd_wrprotect(pmd_set_flags(pmd, _PAGE_UFFD_WP));
}
static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd)
@@ -411,11 +433,6 @@ static inline pmd_t pmd_mkclean(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_DIRTY);
}
-static inline pmd_t pmd_wrprotect(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_RW);
-}
-
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
@@ -1080,7 +1097,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}
-#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
+#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
@@ -1282,8 +1299,6 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
}
-#ifdef _PAGE_SWP_EXCLUSIVE
-#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
@@ -1298,7 +1313,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
}
-#endif /* _PAGE_SWP_EXCLUSIVE */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
@@ -1438,6 +1452,14 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
#ifdef CONFIG_PAGE_TABLE_CHECK
static inline bool pte_user_accessible_page(pte_t pte)
{
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 7c9c968a42ef..7d4ad8907297 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -48,15 +48,6 @@ do { \
#endif /* !__ASSEMBLY__ */
/*
- * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM
- */
-#ifdef CONFIG_FLATMEM
-#define kern_addr_valid(addr) (1)
-#else
-#define kern_addr_valid(kaddr) (0)
-#endif
-
-/*
* This is used to calculate the .brk reservation for initial pagetables.
* Enough space is reserved to allocate pagetables sufficient to cover all
* of LOWMEM_PAGES, which is an upper bound on the size of the direct map of
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e479491da8d5..7929327abe00 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd)
#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
#define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val })
-extern int kern_addr_valid(unsigned long addr);
extern void cleanup_highmap(void);
#define HAVE_ARCH_UNMAPPED_AREA
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 04f36063ad54..38b54b992f32 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -19,6 +19,7 @@ typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
+typedef struct { pmdval_t pmd; } pmd_t;
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;
@@ -103,7 +104,7 @@ extern unsigned int ptrs_per_p4d;
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
/*
- * See Documentation/x86/x86_64/mm.rst for a description of the memory map.
+ * See Documentation/arch/x86/x86_64/mm.rst for a description of the memory map.
*
* Be very careful vs. KASLR when changing anything here. The KASLR address
* range must not overlap with anything except the KASAN shadow area, which
diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h
index d34cce1b995c..4f056fb88174 100644
--- a/arch/x86/include/asm/pgtable_areas.h
+++ b/arch/x86/include/asm/pgtable_areas.h
@@ -11,6 +11,12 @@
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
-#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
+#ifdef CONFIG_X86_32
+#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \
+ (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \
+ CPU_ENTRY_AREA_BASE)
+#else
+#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE
+#endif
#endif /* _ASM_X86_PGTABLE_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index aa174fed3a71..447d4bee25c4 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -361,11 +361,9 @@ static inline pudval_t native_pud_val(pud_t pud)
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-typedef struct { pmdval_t pmd; } pmd_t;
-
static inline pmd_t native_make_pmd(pmdval_t val)
{
- return (pmd_t) { val };
+ return (pmd_t) { .pmd = val };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 5f6daea1ee24..2d13f25b1bd8 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -4,11 +4,11 @@
#include <asm/rmwcc.h>
#include <asm/percpu.h>
+#include <asm/current.h>
+
#include <linux/thread_info.h>
#include <linux/static_call_types.h>
-DECLARE_PER_CPU(int, __preempt_count);
-
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
@@ -24,7 +24,7 @@ DECLARE_PER_CPU(int, __preempt_count);
*/
static __always_inline int preempt_count(void)
{
- return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+ return raw_cpu_read_4(pcpu_hot.preempt_count) & ~PREEMPT_NEED_RESCHED;
}
static __always_inline void preempt_count_set(int pc)
@@ -32,10 +32,10 @@ static __always_inline void preempt_count_set(int pc)
int old, new;
do {
- old = raw_cpu_read_4(__preempt_count);
+ old = raw_cpu_read_4(pcpu_hot.preempt_count);
new = (old & PREEMPT_NEED_RESCHED) |
(pc & ~PREEMPT_NEED_RESCHED);
- } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
+ } while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
}
/*
@@ -44,7 +44,7 @@ static __always_inline void preempt_count_set(int pc)
#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- per_cpu(__preempt_count, (cpu)) = PREEMPT_DISABLED; \
+ per_cpu(pcpu_hot.preempt_count, (cpu)) = PREEMPT_DISABLED; \
} while (0)
/*
@@ -58,17 +58,17 @@ static __always_inline void preempt_count_set(int pc)
static __always_inline void set_preempt_need_resched(void)
{
- raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+ raw_cpu_and_4(pcpu_hot.preempt_count, ~PREEMPT_NEED_RESCHED);
}
static __always_inline void clear_preempt_need_resched(void)
{
- raw_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+ raw_cpu_or_4(pcpu_hot.preempt_count, PREEMPT_NEED_RESCHED);
}
static __always_inline bool test_preempt_need_resched(void)
{
- return !(raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+ return !(raw_cpu_read_4(pcpu_hot.preempt_count) & PREEMPT_NEED_RESCHED);
}
/*
@@ -77,12 +77,12 @@ static __always_inline bool test_preempt_need_resched(void)
static __always_inline void __preempt_count_add(int val)
{
- raw_cpu_add_4(__preempt_count, val);
+ raw_cpu_add_4(pcpu_hot.preempt_count, val);
}
static __always_inline void __preempt_count_sub(int val)
{
- raw_cpu_add_4(__preempt_count, -val);
+ raw_cpu_add_4(pcpu_hot.preempt_count, -val);
}
/*
@@ -92,7 +92,8 @@ static __always_inline void __preempt_count_sub(int val)
*/
static __always_inline bool __preempt_count_dec_and_test(void)
{
- return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var]));
+ return GEN_UNARY_RMWcc("decl", pcpu_hot.preempt_count, e,
+ __percpu_arg([var]));
}
/*
@@ -100,7 +101,7 @@ static __always_inline bool __preempt_count_dec_and_test(void)
*/
static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
+ return unlikely(raw_cpu_read_4(pcpu_hot.preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPTION
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 02c2cbda4a74..d8cccadc83a6 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -28,6 +28,8 @@
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
+ * On systemms with LAM, bits 61 and 62 are used to indicate LAM mode.
+ *
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
@@ -35,7 +37,7 @@
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID and SME encryption bits. */
-#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
+#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 67c9d73b31fa..a1e4fa58b357 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -16,6 +16,7 @@ struct vm86;
#include <uapi/asm/sigcontext.h>
#include <asm/current.h>
#include <asm/cpufeatures.h>
+#include <asm/cpuid.h>
#include <asm/page.h>
#include <asm/pgtable_types.h>
#include <asm/percpu.h>
@@ -146,17 +147,6 @@ struct cpuinfo_x86 {
unsigned initialized : 1;
} __randomize_layout;
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
-
-enum cpuid_regs_idx {
- CPUID_EAX = 0,
- CPUID_EBX,
- CPUID_ECX,
- CPUID_EDX,
-};
-
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
@@ -205,45 +195,6 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
-#ifdef CONFIG_X86_32
-extern int have_cpuid_p(void);
-#else
-static inline int have_cpuid_p(void)
-{
- return 1;
-}
-#endif
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- asm volatile("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx)
- : "memory");
-}
-
-#define native_cpuid_reg(reg) \
-static inline unsigned int native_cpuid_##reg(unsigned int op) \
-{ \
- unsigned int eax = op, ebx, ecx = 0, edx; \
- \
- native_cpuid(&eax, &ebx, &ecx, &edx); \
- \
- return reg; \
-}
-
-/*
- * Native CPUID functions returning a single datum.
- */
-native_cpuid_reg(eax)
-native_cpuid_reg(ebx)
-native_cpuid_reg(ecx)
-native_cpuid_reg(edx)
-
/*
* Friendlier CR3 helpers.
*/
@@ -426,8 +377,6 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
/*
@@ -450,8 +399,6 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
}
-DECLARE_PER_CPU(void *, hardirq_stack_ptr);
-DECLARE_PER_CPU(bool, hardirq_stack_inuse);
extern asmlinkage void ignore_sysret(void);
/* Save actual FS/GS selectors and bases to current->thread */
@@ -460,8 +407,6 @@ void current_save_fsgs(void);
#ifdef CONFIG_STACKPROTECTOR
DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
-DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
#endif /* !X86_64 */
struct perf_event;
@@ -566,7 +511,7 @@ static __always_inline unsigned long current_top_of_stack(void)
* and around vm86 mode and sp0 on x86_64 is special because of the
* entry trampoline.
*/
- return this_cpu_read_stable(cpu_current_top_of_stack);
+ return this_cpu_read_stable(pcpu_hot.top_of_stack);
}
static __always_inline bool on_thread_stack(void)
@@ -578,7 +523,6 @@ static __always_inline bool on_thread_stack(void)
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
-#define __cpuid native_cpuid
static inline void load_sp0(unsigned long sp0)
{
@@ -589,69 +533,6 @@ static inline void load_sp0(unsigned long sp0)
unsigned long __get_wchan(struct task_struct *p);
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = 0;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(unsigned int op, int count,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = count;
- __cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return eax;
-}
-
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ebx;
-}
-
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return ecx;
-}
-
-static inline unsigned int cpuid_edx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
-
- return edx;
-}
-
extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void amd_e400_c1e_apic_setup(void);
@@ -661,16 +542,14 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL};
extern void enable_sep_cpu(void);
-extern int sysenter_setup(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
-extern void switch_to_new_gdt(int);
+extern void switch_gdt_and_percpu_base(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
-extern void load_percpu_segment(int);
extern void cpu_init(void);
extern void cpu_init_secondary(void);
extern void cpu_init_exception_handling(void);
@@ -768,7 +647,11 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD { \
+ .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}
extern unsigned long KSTK_ESP(struct task_struct *task);
@@ -805,24 +688,6 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; }
static inline u32 amd_get_highest_perf(void) { return 0; }
#endif
-#define for_each_possible_hypervisor_cpuid_base(function) \
- for (function = 0x40000000; function < 0x40010000; function += 0x100)
-
-static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
-{
- uint32_t base, eax, signature[3];
-
- for_each_possible_hypervisor_cpuid_base(base) {
- cpuid(base, &eax, &signature[0], &signature[1], &signature[2]);
-
- if (!memcmp(sig, signature, 12) &&
- (leaves == 0 || ((eax - base) >= leaves)))
- return base;
- }
-
- return 0;
-}
-
extern unsigned long arch_align_stack(unsigned long sp);
void free_init_pages(const char *what, unsigned long begin, unsigned long end);
extern void free_kernel_image_pages(const char *what, void *begin, void *end);
@@ -835,7 +700,8 @@ bool xen_set_default_idle(void);
#endif
void __noreturn stop_this_cpu(void *dummy);
-void microcode_check(void);
+void microcode_check(struct cpuinfo_x86 *prev_info);
+void store_cpu_caps(struct cpuinfo_x86 *info);
enum l1tf_mitigations {
L1TF_MITIGATION_OFF,
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 19b695ff2c68..0c92db84469d 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -7,6 +7,7 @@
/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
@@ -39,7 +40,7 @@ bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
*/
-static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+static __always_inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
{
u64 product;
#ifdef __i386__
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 60ece592b220..42b17cf10b10 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -14,8 +14,6 @@
__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
#define __pv_queued_spin_unlock __pv_queued_spin_unlock
-#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
-#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
/*
* Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
@@ -37,32 +35,27 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text");
* rsi = lockval (second argument)
* rdx = internal variable (set to 0)
*/
-asm (".pushsection .spinlock.text;"
- ".globl " PV_UNLOCK ";"
- ".type " PV_UNLOCK ", @function;"
- ".align 4,0x90;"
- PV_UNLOCK ": "
- ASM_ENDBR
- FRAME_BEGIN
- "push %rdx;"
- "mov $0x1,%eax;"
- "xor %edx,%edx;"
- LOCK_PREFIX "cmpxchg %dl,(%rdi);"
- "cmp $0x1,%al;"
- "jne .slowpath;"
- "pop %rdx;"
+#define PV_UNLOCK_ASM \
+ FRAME_BEGIN \
+ "push %rdx\n\t" \
+ "mov $0x1,%eax\n\t" \
+ "xor %edx,%edx\n\t" \
+ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \
+ "cmp $0x1,%al\n\t" \
+ "jne .slowpath\n\t" \
+ "pop %rdx\n\t" \
+ FRAME_END \
+ ASM_RET \
+ ".slowpath:\n\t" \
+ "push %rsi\n\t" \
+ "movzbl %al,%esi\n\t" \
+ "call __raw_callee_save___pv_queued_spin_unlock_slowpath\n\t" \
+ "pop %rsi\n\t" \
+ "pop %rdx\n\t" \
FRAME_END
- ASM_RET
- ".slowpath: "
- "push %rsi;"
- "movzbl %al,%esi;"
- "call " PV_UNLOCK_SLOWPATH ";"
- "pop %rsi;"
- "pop %rdx;"
- FRAME_END
- ASM_RET
- ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
- ".popsection");
+
+DEFINE_PARAVIRT_ASM(__raw_callee_save___pv_queued_spin_unlock,
+ PV_UNLOCK_ASM, .spinlock.text);
#else /* CONFIG_64BIT */
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fd6f6e5b755a..f6a1737c77be 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -59,7 +59,6 @@ extern struct real_mode_header *real_mode_header;
extern unsigned char real_mode_blob_end[];
extern unsigned long initial_code;
-extern unsigned long initial_gs;
extern unsigned long initial_stack;
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern unsigned long initial_vc_handler;
@@ -91,6 +90,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
void reserve_real_mode(void);
void load_trampoline_pgtable(void);
+void init_real_mode(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..9177b4354c3f 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,8 +25,9 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
-void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
void run_crash_ipi_callback(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index aff774775c67..7ba1726b71c7 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -98,6 +98,7 @@
#define REQUIRED_MASK17 0
#define REQUIRED_MASK18 0
#define REQUIRED_MASK19 0
-#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
+#define REQUIRED_MASK20 0
+#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index d24b04ebf950..255a78d9d906 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -7,8 +7,6 @@
#include <linux/sched.h>
#include <linux/jump_label.h>
-#define IA32_PQR_ASSOC 0x0c8f
-
/**
* struct resctrl_pqr_state - State cache for the PQR MSR
* @cur_rmid: The cached Resource Monitoring ID
@@ -16,8 +14,8 @@
* @default_rmid: The user assigned Resource Monitoring ID
* @default_closid: The user assigned cached Class Of Service ID
*
- * The upper 32 bits of IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to IA32_PQR_ASSOC always
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
* contains both parts, so we need to cache them. This also
* stores the user configured per cpu CLOSID and RMID.
*
@@ -51,7 +49,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* simple as possible.
* Must be called with preemption disabled.
*/
-static void __resctrl_sched_in(void)
+static inline void __resctrl_sched_in(struct task_struct *tsk)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
@@ -63,13 +61,13 @@ static void __resctrl_sched_in(void)
* Else use the closid/rmid assigned to this cpu.
*/
if (static_branch_likely(&rdt_alloc_enable_key)) {
- tmp = READ_ONCE(current->closid);
+ tmp = READ_ONCE(tsk->closid);
if (tmp)
closid = tmp;
}
if (static_branch_likely(&rdt_mon_enable_key)) {
- tmp = READ_ONCE(current->rmid);
+ tmp = READ_ONCE(tsk->rmid);
if (tmp)
rmid = tmp;
}
@@ -77,7 +75,7 @@ static void __resctrl_sched_in(void)
if (closid != state->cur_closid || rmid != state->cur_rmid) {
state->cur_closid = closid;
state->cur_rmid = rmid;
- wrmsr(IA32_PQR_ASSOC, rmid, closid);
+ wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
}
}
@@ -90,17 +88,17 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
return val * scale;
}
-static inline void resctrl_sched_in(void)
+static inline void resctrl_sched_in(struct task_struct *tsk)
{
if (static_branch_likely(&rdt_enable_key))
- __resctrl_sched_in();
+ __resctrl_sched_in(tsk);
}
void resctrl_cpu_detect(struct cpuinfo_x86 *c);
#else
-static inline void resctrl_sched_in(void) {}
+static inline void resctrl_sched_in(struct task_struct *tsk) {}
static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {}
#endif /* CONFIG_X86_CPU_RESCTRL */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 2e7890dd58a4..794f69625780 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -96,7 +96,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - unused
+ * 28 - VDSO getcpu
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -119,6 +119,7 @@
#define GDT_ENTRY_ESPFIX_SS 26
#define GDT_ENTRY_PERCPU 27
+#define GDT_ENTRY_CPUNODE 28
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
@@ -135,6 +136,7 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __USER32_CS __USER_CS
#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
/* segment for calling fn: */
@@ -158,6 +160,8 @@
# define __KERNEL_PERCPU 0
#endif
+#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
+
#else /* 64-bit: */
#include <asm/cache.h>
@@ -210,7 +214,6 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
-#define __USER32_DS __USER_DS
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3)
@@ -226,8 +229,6 @@
#define GDT_ENTRY_TLS_ENTRIES 3
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
-#ifdef CONFIG_X86_64
-
/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
#define VDSO_CPUNODE_BITS 12
#define VDSO_CPUNODE_MASK 0xfff
@@ -265,7 +266,6 @@ static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node)
}
#endif /* !__ASSEMBLY__ */
-#endif /* CONFIG_X86_64 */
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index b45c4d27fd46..a5e89641bd2d 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -6,6 +6,9 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#define set_memory_rox set_memory_rox
+int set_memory_rox(unsigned long addr, int numpages);
+
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f37cbff7354c..f3495623ac99 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -125,11 +125,11 @@ void clear_bss(void);
#ifdef __i386__
-asmlinkage void __init i386_start_kernel(void);
+asmlinkage void __init __noreturn i386_start_kernel(void);
#else
-asmlinkage void __init x86_64_start_kernel(char *real_mode);
-asmlinkage void __init x86_64_start_reservations(char *real_mode_data);
+asmlinkage void __init __noreturn x86_64_start_kernel(char *real_mode);
+asmlinkage void __init __noreturn x86_64_start_reservations(char *real_mode_data);
#endif /* __i386__ */
#endif /* _SETUP */
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b8357d6ecd47..0759af9b1acf 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -128,9 +128,6 @@ struct snp_psc_desc {
struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
} __packed;
-/* Guest message request error code */
-#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32)
-
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ebc271bb6d8e..13dc2a9d23c1 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -9,6 +9,8 @@
#define __ASM_ENCRYPTED_STATE_H
#include <linux/types.h>
+#include <linux/sev-guest.h>
+
#include <asm/insn.h>
#include <asm/sev-common.h>
#include <asm/bootparam.h>
@@ -185,6 +187,9 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
return rc;
}
+
+struct snp_guest_request_ioctl;
+
void setup_ghcb(void);
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned int npages);
@@ -196,7 +201,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
@@ -216,8 +221,7 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
-static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input,
- unsigned long *fw_err)
+static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
}
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index eae20fa52b93..6a0069761508 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -115,17 +115,36 @@ enum sgx_miscselect {
* %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
* sign cryptographic tokens that can be passed to
* EINIT as an authorization to run an enclave.
+ * %SGX_ATTR_ASYNC_EXIT_NOTIFY: Allow enclaves to be notified after an
+ * asynchronous exit has occurred.
*/
enum sgx_attribute {
- SGX_ATTR_INIT = BIT(0),
- SGX_ATTR_DEBUG = BIT(1),
- SGX_ATTR_MODE64BIT = BIT(2),
- SGX_ATTR_PROVISIONKEY = BIT(4),
- SGX_ATTR_EINITTOKENKEY = BIT(5),
- SGX_ATTR_KSS = BIT(7),
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ /* BIT(3) is reserved */
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ /* BIT(6) is for CET */
+ SGX_ATTR_KSS = BIT(7),
+ /* BIT(8) is reserved */
+ /* BIT(9) is reserved */
+ SGX_ATTR_ASYNC_EXIT_NOTIFY = BIT(10),
};
-#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | \
+ BIT_ULL(6) | \
+ BIT_ULL(8) | \
+ BIT_ULL(9) | \
+ GENMASK_ULL(63, 11))
+
+#define SGX_ATTR_UNPRIV_MASK (SGX_ATTR_DEBUG | \
+ SGX_ATTR_MODE64BIT | \
+ SGX_ATTR_KSS | \
+ SGX_ATTR_ASYNC_EXIT_NOTIFY)
+
+#define SGX_ATTR_PRIV_MASK (SGX_ATTR_PROVISIONKEY | \
+ SGX_ATTR_EINITTOKENKEY)
/**
* struct sgx_secs - SGX Enclave Control Structure (SECS)
diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h
index c0ef921c0586..8009d781c2f9 100644
--- a/arch/x86/include/asm/shared/io.h
+++ b/arch/x86/include/asm/shared/io.h
@@ -5,13 +5,13 @@
#include <linux/types.h>
#define BUILDIO(bwl, bw, type) \
-static inline void __out##bwl(type value, u16 port) \
+static __always_inline void __out##bwl(type value, u16 port) \
{ \
asm volatile("out" #bwl " %" #bw "0, %w1" \
: : "a"(value), "Nd"(port)); \
} \
\
-static inline type __in##bwl(u16 port) \
+static __always_inline type __in##bwl(u16 port) \
{ \
type value; \
asm volatile("in" #bwl " %w1, %" #bw "0" \
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index e53f26228fbb..2631e01f6e0f 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -7,9 +7,6 @@
#define TDX_HYPERCALL_STANDARD 0
-#define TDX_HCALL_HAS_OUTPUT BIT(0)
-#define TDX_HCALL_ISSUE_STI BIT(1)
-
#define TDX_CPUID_LEAF_ID 0x21
#define TDX_IDENT "IntelTDX "
@@ -22,16 +19,23 @@
* This is a software only structure and not part of the TDX module/VMM ABI.
*/
struct tdx_hypercall_args {
+ u64 r8;
+ u64 r9;
u64 r10;
u64 r11;
u64 r12;
u64 r13;
u64 r14;
u64 r15;
+ u64 rdi;
+ u64 rsi;
+ u64 rbx;
+ u64 rdx;
};
/* Used to request services from the VMM */
-u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags);
+u64 __tdx_hypercall(struct tdx_hypercall_args *args);
+u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args);
/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void);
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 65e667279e0f..e770c4fc47f4 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -15,4 +15,13 @@
void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
+ void __user **fpstate);
+
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs);
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
+
#endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 2dfb5fea13af..4a4043ca6493 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -28,11 +28,6 @@ typedef struct {
#define SA_IA32_ABI 0x02000000u
#define SA_X32_ABI 0x01000000u
-#ifndef CONFIG_COMPAT
-#define compat_sigset_t compat_sigset_t
-typedef sigset_t compat_sigset_t;
-#endif
-
#endif /* __ASSEMBLY__ */
#include <uapi/asm/signal.h>
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a73bced40e24..4e91054c84be 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -3,10 +3,10 @@
#define _ASM_X86_SMP_H
#ifndef __ASSEMBLY__
#include <linux/cpumask.h>
-#include <asm/percpu.h>
-#include <asm/thread_info.h>
#include <asm/cpumask.h>
+#include <asm/current.h>
+#include <asm/thread_info.h>
extern int smp_num_siblings;
extern unsigned int num_processors;
@@ -19,7 +19,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
@@ -94,12 +93,13 @@ static inline void __cpu_die(unsigned int cpu)
smp_ops.cpu_die(cpu);
}
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
smp_ops.play_dead();
+ BUG();
}
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
{
smp_ops.smp_send_reschedule(cpu);
}
@@ -125,7 +125,7 @@ int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
-void hlt_play_dead(void);
+void __noreturn hlt_play_dead(void);
void native_play_dead(void);
void play_dead_common(void);
void wbinvd_on_cpu(int cpu);
@@ -150,11 +150,10 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r);
/*
* This function is needed by all SMP systems. It must _always_ be valid
- * from the initial startup. We map APIC_BASE very early in page_setup(),
- * so this is correct in the x86 case.
+ * from the initial startup.
*/
-#define raw_smp_processor_id() this_cpu_read(cpu_number)
-#define __smp_processor_id() __this_cpu_read(cpu_number)
+#define raw_smp_processor_id() this_cpu_read(pcpu_hot.cpu_number)
+#define __smp_processor_id() __this_cpu_read(pcpu_hot.cpu_number)
#ifdef CONFIG_X86_32
extern int safe_smp_processor_id(void);
@@ -201,5 +200,8 @@ extern void nmi_selftest(void);
#define nmi_selftest() do { } while (0)
#endif
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 35f709f619fb..de48d1389936 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -115,22 +115,11 @@ static inline void wrpkru(u32 pkru)
}
#endif
-static inline void native_wbinvd(void)
+static __always_inline void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}
-extern asmlinkage void asm_load_gs_index(unsigned int selector);
-
-static inline void native_load_gs_index(unsigned int selector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- asm_load_gs_index(selector);
- local_irq_restore(flags);
-}
-
static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
@@ -179,24 +168,14 @@ static inline void __write_cr4(unsigned long x)
native_write_cr4(x);
}
-static inline void wbinvd(void)
+static __always_inline void wbinvd(void)
{
native_wbinvd();
}
-
-static inline void load_gs_index(unsigned int selector)
-{
-#ifdef CONFIG_X86_64
- native_load_gs_index(selector);
-#else
- loadsegment(gs, selector);
-#endif
-}
-
#endif /* CONFIG_PARAVIRT_XXL */
-static inline void clflush(volatile void *__p)
+static __always_inline void clflush(volatile void *__p)
{
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
}
@@ -295,7 +274,7 @@ static inline int enqcmds(void __iomem *dst, const void *src)
return 0;
}
-static inline void tile_release(void)
+static __always_inline void tile_release(void)
{
/*
* Instruction opcode for TILERELEASE; supported in binutils
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 24a8d6c4fb18..00473a650f51 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -34,7 +34,6 @@
#include <asm/percpu.h>
#include <asm/desc.h>
-#include <linux/random.h>
#include <linux/sched.h>
/*
@@ -50,22 +49,11 @@
*/
static __always_inline void boot_init_stack_canary(void)
{
- u64 canary;
- u64 tsc;
+ unsigned long canary = get_random_canary();
#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
#endif
- /*
- * We both use the random pool and the current TSC as a source
- * of randomness. The TSC only matters for very early init,
- * there it already has some randomness on most systems. Later
- * on during the bootup the random pool has true entropy too.
- */
- get_random_bytes(&canary, sizeof(canary));
- tsc = rdtsc();
- canary += tsc + (tsc << 32UL);
- canary &= CANARY_MASK;
current->stack_canary = canary;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 888731ccf1f6..857d364b9888 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -15,24 +15,18 @@
#endif
#define __HAVE_ARCH_MEMCPY 1
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-#undef memcpy
-#define memcpy __msan_memcpy
-#else
extern void *memcpy(void *to, const void *from, size_t len);
-#endif
extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-extern void *__msan_memset(void *s, int c, size_t n);
-#undef memset
-#define memset __msan_memset
-#else
void *memset(void *s, int c, size_t n);
-#endif
void *__memset(void *s, int c, size_t n);
+/*
+ * KMSAN needs to instrument as much code as possible. Use C versions of
+ * memsetXX() from lib/string.c under KMSAN.
+ */
+#if !defined(CONFIG_KMSAN)
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
@@ -68,15 +62,10 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
: "memory");
return s;
}
+#endif
#define __HAVE_ARCH_MEMMOVE
-#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY)
-#undef memmove
-void *__msan_memmove(void *dest, const void *src, size_t len);
-#define memmove __msan_memmove
-#else
void *memmove(void *dest, const void *src, size_t count);
-#endif
void *__memmove(void *dest, const void *src, size_t count);
int memcmp(const void *cs, const void *ct, size_t count);
@@ -85,25 +74,6 @@ char *strcpy(char *dest, const char *src);
char *strcat(char *dest, const char *src);
int strcmp(const char *cs, const char *ct);
-#if (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__))
-/*
- * For files that not instrumented (e.g. mm/slub.c) we
- * should use not instrumented version of mem* functions.
- */
-
-#undef memcpy
-#define memcpy(dst, src, len) __memcpy(dst, src, len)
-#undef memmove
-#define memmove(dst, src, len) __memmove(dst, src, len)
-#undef memset
-#define memset(s, c, n) __memset(s, c, n)
-
-#ifndef __NO_FORTIFY
-#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */
-#endif
-
-#endif
-
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
#define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0361626841bc..e7c7379d6ac7 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -5,6 +5,8 @@
#include <uapi/asm/svm.h>
#include <uapi/asm/kvm.h>
+#include <asm/hyperv-tlfs.h>
+
/*
* 32-bit intercept words in the VMCB Control Area, starting
* at Byte offset 000h.
@@ -161,7 +163,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
*/
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
@@ -178,6 +183,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_SHIFT 9
#define V_GIF_MASK (1 << V_GIF_SHIFT)
+#define V_NMI_PENDING_SHIFT 11
+#define V_NMI_PENDING_MASK (1 << V_NMI_PENDING_SHIFT)
+
+#define V_NMI_BLOCKING_SHIFT 12
+#define V_NMI_BLOCKING_MASK (1 << V_NMI_BLOCKING_SHIFT)
+
#define V_INTR_PRIO_SHIFT 16
#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
@@ -192,6 +203,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define V_GIF_ENABLE_SHIFT 25
#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+#define V_NMI_ENABLE_SHIFT 26
+#define V_NMI_ENABLE_MASK (1 << V_NMI_ENABLE_SHIFT)
+
#define AVIC_ENABLE_SHIFT 31
#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
@@ -256,22 +270,23 @@ enum avic_ipi_failure_cause {
AVIC_IPI_FAILURE_INVALID_BACKING_PAGE,
};
-#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(9, 0)
+#define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0)
/*
- * For AVIC, the max index allowed for physical APIC ID
- * table is 0xff (255).
+ * For AVIC, the max index allowed for physical APIC ID table is 0xfe (254), as
+ * 0xff is a broadcast to all CPUs, i.e. can't be targeted individually.
*/
#define AVIC_MAX_PHYSICAL_ID 0XFEULL
/*
- * For x2AVIC, the max index allowed for physical APIC ID
- * table is 0x1ff (511).
+ * For x2AVIC, the max index allowed for physical APIC ID table is 0x1ff (511).
*/
#define X2AVIC_MAX_PHYSICAL_ID 0x1FFUL
+static_assert((AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == AVIC_MAX_PHYSICAL_ID);
+static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_MAX_PHYSICAL_ID);
+
#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF)
-#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
struct vmcb_seg {
@@ -293,12 +308,13 @@ struct vmcb_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u8 reserved_1[42];
+ /* Reserved fields are named following their struct offset */
+ u8 reserved_0xa0[42];
u8 vmpl;
u8 cpl;
- u8 reserved_2[4];
+ u8 reserved_0xcc[4];
u64 efer;
- u8 reserved_3[112];
+ u8 reserved_0xd8[112];
u64 cr4;
u64 cr3;
u64 cr0;
@@ -306,7 +322,7 @@ struct vmcb_save_area {
u64 dr6;
u64 rflags;
u64 rip;
- u8 reserved_4[88];
+ u8 reserved_0x180[88];
u64 rsp;
u64 s_cet;
u64 ssp;
@@ -321,14 +337,14 @@ struct vmcb_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_5[32];
+ u8 reserved_0x248[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
- u8 reserved_6[72];
+ u8 reserved_0x298[72];
u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */
} __packed;
@@ -349,12 +365,12 @@ struct sev_es_save_area {
u64 vmpl2_ssp;
u64 vmpl3_ssp;
u64 u_cet;
- u8 reserved_1[2];
+ u8 reserved_0xc8[2];
u8 vmpl;
u8 cpl;
- u8 reserved_2[4];
+ u8 reserved_0xcc[4];
u64 efer;
- u8 reserved_3[104];
+ u8 reserved_0xd8[104];
u64 xss;
u64 cr4;
u64 cr3;
@@ -371,7 +387,7 @@ struct sev_es_save_area {
u64 dr1_addr_mask;
u64 dr2_addr_mask;
u64 dr3_addr_mask;
- u8 reserved_4[24];
+ u8 reserved_0x1c0[24];
u64 rsp;
u64 s_cet;
u64 ssp;
@@ -386,21 +402,21 @@ struct sev_es_save_area {
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
- u8 reserved_5[32];
+ u8 reserved_0x248[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
- u8 reserved_7[80];
+ u8 reserved_0x298[80];
u32 pkru;
- u8 reserved_8[20];
- u64 reserved_9; /* rax already available at 0x01f8 */
+ u32 tsc_aux;
+ u8 reserved_0x2f0[24];
u64 rcx;
u64 rdx;
u64 rbx;
- u64 reserved_10; /* rsp already available at 0x01d8 */
+ u64 reserved_0x320; /* rsp already available at 0x01d8 */
u64 rbp;
u64 rsi;
u64 rdi;
@@ -412,7 +428,7 @@ struct sev_es_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_11[16];
+ u8 reserved_0x380[16];
u64 guest_exit_info_1;
u64 guest_exit_info_2;
u64 guest_exit_int_info;
@@ -425,7 +441,7 @@ struct sev_es_save_area {
u64 pcpu_id;
u64 event_inj;
u64 xcr0;
- u8 reserved_12[16];
+ u8 reserved_0x3f0[16];
/* Floating point area */
u64 x87_dp;
@@ -443,23 +459,23 @@ struct sev_es_save_area {
} __packed;
struct ghcb_save_area {
- u8 reserved_1[203];
+ u8 reserved_0x0[203];
u8 cpl;
- u8 reserved_2[116];
+ u8 reserved_0xcc[116];
u64 xss;
- u8 reserved_3[24];
+ u8 reserved_0x148[24];
u64 dr7;
- u8 reserved_4[16];
+ u8 reserved_0x168[16];
u64 rip;
- u8 reserved_5[88];
+ u8 reserved_0x180[88];
u64 rsp;
- u8 reserved_6[24];
+ u8 reserved_0x1e0[24];
u64 rax;
- u8 reserved_7[264];
+ u8 reserved_0x200[264];
u64 rcx;
u64 rdx;
u64 rbx;
- u8 reserved_8[8];
+ u8 reserved_0x320[8];
u64 rbp;
u64 rsi;
u64 rdi;
@@ -471,12 +487,12 @@ struct ghcb_save_area {
u64 r13;
u64 r14;
u64 r15;
- u8 reserved_9[16];
+ u8 reserved_0x380[16];
u64 sw_exit_code;
u64 sw_exit_info_1;
u64 sw_exit_info_2;
u64 sw_scratch;
- u8 reserved_10[56];
+ u8 reserved_0x3b0[56];
u64 xcr0;
u8 valid_bitmap[16];
u64 x87_state_gpa;
@@ -490,7 +506,7 @@ struct ghcb {
u8 shared_buffer[GHCB_SHARED_BUF_SIZE];
- u8 reserved_1[10];
+ u8 reserved_0xff0[10];
u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
u32 ghcb_usage;
} __packed;
@@ -502,6 +518,9 @@ struct ghcb {
#define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024
#define EXPECTED_GHCB_SIZE PAGE_SIZE
+#define BUILD_BUG_RESERVED_OFFSET(x, y) \
+ ASSERT_STRUCT_OFFSET(struct x, reserved ## _ ## y, y)
+
static inline void __unused_size_checks(void)
{
BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
@@ -509,6 +528,39 @@ static inline void __unused_size_checks(void)
BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
+
+ /* Check offsets of reserved fields */
+
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xa0);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(vmcb_save_area, 0x298);
+
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xc8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0xd8);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x1c0);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x248);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x298);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x2f0);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(sev_es_save_area, 0x3f0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0xcc);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x148);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x168);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x180);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x1e0);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x200);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x320);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x380);
+ BUILD_BUG_RESERVED_OFFSET(ghcb_save_area, 0x3b0);
+
+ BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
}
struct vmcb {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index c08eb0fdd11f..5c91305d09d2 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -66,13 +66,10 @@ static inline void update_task_stack(struct task_struct *task)
{
/* sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32
- if (static_cpu_has(X86_FEATURE_XENPV))
- load_sp0(task->thread.sp0);
- else
- this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+ this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
/* Xen PV enters the kernel on the thread stack. */
- if (static_cpu_has(X86_FEATURE_XENPV))
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
}
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 020c81a7c729..28d889c9aa16 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -67,6 +67,8 @@ void tdx_safe_halt(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
+int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);
+
#else
static inline void tdx_early_init(void) { };
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 1cc15528ce29..29832c338cdc 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern void *text_poke_copy(void *addr, const void *opcode, size_t len);
+extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok);
extern void *text_poke_set(void *addr, int c, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
@@ -183,6 +184,37 @@ void int3_emulate_ret(struct pt_regs *regs)
unsigned long ip = int3_emulate_pop(regs);
int3_emulate_jmp(regs, ip);
}
+
+static __always_inline
+void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp)
+{
+ static const unsigned long jcc_mask[6] = {
+ [0] = X86_EFLAGS_OF,
+ [1] = X86_EFLAGS_CF,
+ [2] = X86_EFLAGS_ZF,
+ [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+ [4] = X86_EFLAGS_SF,
+ [5] = X86_EFLAGS_PF,
+ };
+
+ bool invert = cc & 1;
+ bool match;
+
+ if (cc < 0xc) {
+ match = regs->flags & jcc_mask[cc >> 1];
+ } else {
+ match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+ if (cc >= 0xe)
+ match = match || (regs->flags & X86_EFLAGS_ZF);
+ }
+
+ if ((match && !invert) || (!match && invert))
+ ip += disp;
+
+ int3_emulate_jmp(regs, ip);
+}
+
#endif /* !CONFIG_UML_X86 */
#endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0cb881c1d69..f1cccba52eb9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -163,7 +163,12 @@ struct thread_info {
* GOOD_FRAME if within a frame
* BAD_STACK if placed across a frame boundary (or outside stack)
* NOT_STACK unable to determine (no frame pointers, etc)
+ *
+ * This function reads pointers from the stack and dereferences them. The
+ * pointers may not have their KMSAN shadow set up properly, which may result
+ * in false positive reports. Disable instrumentation to avoid those.
*/
+__no_kmsan_checks
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
const void *obj, unsigned long len)
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 8ac563abb567..a53961c64a56 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
+extern bool tsc_clocksource_watchdog_disabled(void);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27..75bfaa421030 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_TLBFLUSH_H
#define _ASM_X86_TLBFLUSH_H
-#include <linux/mm.h>
+#include <linux/mm_types.h>
#include <linux/sched.h>
#include <asm/processor.h>
@@ -12,6 +12,7 @@
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
void __flush_tlb_all(void);
@@ -53,6 +54,15 @@ static inline void cr4_clear_bits(unsigned long mask)
local_irq_restore(flags);
}
+#ifdef CONFIG_ADDRESS_MASKING
+DECLARE_PER_CPU(u64, tlbstate_untag_mask);
+
+static inline u64 current_untag_mask(void)
+{
+ return this_cpu_read(tlbstate_untag_mask);
+}
+#endif
+
#ifndef MODULE
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
@@ -101,6 +111,16 @@ struct tlb_state {
*/
bool invalidate_other;
+#ifdef CONFIG_ADDRESS_MASKING
+ /*
+ * Active LAM mode.
+ *
+ * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
+ * disabled.
+ */
+ u8 lam;
+#endif
+
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
@@ -357,6 +377,32 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#define huge_pmd_needs_flush huge_pmd_needs_flush
+#ifdef CONFIG_ADDRESS_MASKING
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ u64 lam = this_cpu_read(cpu_tlbstate.lam);
+
+ return lam << X86_CR3_LAM_U57_BIT;
+}
+
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+{
+ this_cpu_write(cpu_tlbstate.lam,
+ mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+}
+
+#else
+
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ return 0;
+}
+
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+{
+}
+#endif
#endif /* !MODULE */
static inline void __native_tlb_flush_global(unsigned long cr4)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1cc756eafa44..8bae40a66282 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -7,43 +7,21 @@
#include <linux/compiler.h>
#include <linux/instrumented.h>
#include <linux/kasan-checks.h>
+#include <linux/mm_types.h>
#include <linux/string.h>
+#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
+#include <asm/tlbflush.h>
-#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
-static inline bool pagefault_disabled(void);
-# define WARN_ON_IN_IRQ() \
- WARN_ON_ONCE(!in_task() && !pagefault_disabled())
+#ifdef CONFIG_X86_32
+# include <asm/uaccess_32.h>
#else
-# define WARN_ON_IN_IRQ()
+# include <asm/uaccess_64.h>
#endif
-/**
- * access_ok - Checks if a user space pointer is valid
- * @addr: User space pointer to start of block to check
- * @size: Size of block to check
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Checks if a pointer to a block of memory in user space is valid.
- *
- * Note that, depending on architecture, this function probably just
- * checks that the pointer is in the user space range - after calling
- * this function, memory access functions may still return -EFAULT.
- *
- * Return: true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- */
-#define access_ok(addr, size) \
-({ \
- WARN_ON_IN_IRQ(); \
- likely(__access_ok(addr, size)); \
-})
-
#include <asm-generic/access_ok.h>
extern int __get_user_1(void);
@@ -532,14 +510,6 @@ extern struct movsl_mask {
#define ARCH_HAS_NOCACHE_UACCESS 1
-#ifdef CONFIG_X86_32
-unsigned long __must_check clear_user(void __user *mem, unsigned long len);
-unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
-# include <asm/uaccess_32.h>
-#else
-# include <asm/uaccess_64.h>
-#endif
-
/*
* The "unsafe" user accesses aren't really "unsafe", but the naming
* is a big fat warning: you have to not only do the access_ok()
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 388a40660c7b..40379a1adbb8 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -33,4 +33,7 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
return __copy_from_user_ll_nocache_nozero(to, from, n);
}
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
+
#endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index d13d71af5cf6..81b826d3b753 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -12,38 +12,113 @@
#include <asm/cpufeatures.h>
#include <asm/page.h>
+#ifdef CONFIG_ADDRESS_MASKING
+/*
+ * Mask out tag bits from the address.
+ */
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+ /*
+ * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
+ * in alternative instructions. The relocation gets wrong when gets
+ * copied to the target place.
+ */
+ asm (ALTERNATIVE("",
+ "and %%gs:tlbstate_untag_mask, %[addr]\n\t", X86_FEATURE_LAM)
+ : [addr] "+r" (addr) : "m" (tlbstate_untag_mask));
+
+ return addr;
+}
+
+#define untagged_addr(addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr(__addr); \
+})
+
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+ unsigned long addr)
+{
+ mmap_assert_locked(mm);
+ return addr & (mm)->context.untag_mask;
+}
+
+#define untagged_addr_remote(mm, addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \
+})
+
+#endif
+
+/*
+ * The virtual address space space is logically divided into a kernel
+ * half and a user half. When cast to a signed type, user pointers
+ * are positive and kernel pointers are negative.
+ */
+#define valid_user_address(x) ((long)(x) >= 0)
+
+/*
+ * User pointers can have tag bits on x86-64. This scheme tolerates
+ * arbitrary values in those bits rather then masking them off.
+ *
+ * Enforce two rules:
+ * 1. 'ptr' must be in the user half of the address space
+ * 2. 'ptr+size' must not overflow into kernel addresses
+ *
+ * Note that addresses around the sign change are not valid addresses,
+ * and will GP-fault even with LAM enabled if the sign bit is set (see
+ * "CR3.LAM_SUP" that can narrow the canonicality check if we ever
+ * enable it, but not remove it entirely).
+ *
+ * So the "overflow into kernel addresses" does not imply some sudden
+ * exact boundary at the sign bit, and we can allow a lot of slop on the
+ * size check.
+ *
+ * In fact, we could probably remove the size check entirely, since
+ * any kernel accesses will be in increasing address order starting
+ * at 'ptr', and even if the end might be in kernel space, we'll
+ * hit the GP faults for non-canonical accesses before we ever get
+ * there.
+ *
+ * That's a separate optimization, for now just handle the small
+ * constant case.
+ */
+static inline bool __access_ok(const void __user *ptr, unsigned long size)
+{
+ if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) {
+ return valid_user_address(ptr);
+ } else {
+ unsigned long sum = size + (unsigned long)ptr;
+ return valid_user_address(sum) && sum >= (unsigned long)ptr;
+ }
+}
+#define __access_ok __access_ok
+
/*
* Copy To/From Userspace
*/
/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
-copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);
static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
{
- unsigned ret;
-
+ stac();
/*
- * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
- * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
- * Otherwise, use copy_user_generic_unrolled.
+ * If CPU has FSRM feature, use 'rep movs'.
+ * Otherwise, use rep_movs_alternative.
*/
- alternative_call_2(copy_user_generic_unrolled,
- copy_user_generic_string,
- X86_FEATURE_REP_GOOD,
- copy_user_enhanced_fast_string,
- X86_FEATURE_ERMS,
- ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
- "=d" (len)),
- "1" (to), "2" (from), "3" (len)
- : "memory", "rcx", "r8", "r9", "r10", "r11");
- return ret;
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE("rep movsb",
+ "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+ : : "memory", "rax", "r8", "r9", "r10", "r11");
+ clac();
+ return len;
}
static __always_inline __must_check unsigned long
@@ -58,19 +133,19 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size);
}
-extern long __copy_user_nocache(void *dst, const void __user *src,
- unsigned size, int zerorest);
-
+extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
-extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
- size_t len);
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
unsigned size)
{
+ long ret;
kasan_check_write(dst, size);
- return __copy_user_nocache(dst, src, size, 0);
+ stac();
+ ret = __copy_user_nocache(dst, src, size);
+ clac();
+ return ret;
}
static inline int
@@ -85,11 +160,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
*/
__must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_rep_good(void __user *addr, unsigned long len);
-__must_check unsigned long
-clear_user_erms(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
@@ -102,16 +173,12 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
*/
asm volatile(
"1:\n\t"
- ALTERNATIVE_3("rep stosb",
- "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM),
- "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
- "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+ ALTERNATIVE("rep stosb",
+ "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
"2:\n"
_ASM_EXTABLE_UA(1b, 2b)
: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
- : "a" (0)
- /* rep_good clobbers %rdx */
- : "rdx");
+ : "a" (0));
clac();
@@ -120,7 +187,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
{
- if (access_ok(to, n))
+ if (__access_ok(to, n))
return __clear_user(to, n);
return n;
}
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index f66fbe6537dd..01cb9692b160 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -7,15 +7,20 @@
#ifdef __ASSEMBLY__
-.macro UNWIND_HINT_EMPTY
- UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1
+.macro UNWIND_HINT_END_OF_STACK
+ UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK
+.endm
+
+.macro UNWIND_HINT_UNDEFINED
+ UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED
.endm
.macro UNWIND_HINT_ENTRY
- UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_END_OF_STACK
.endm
-.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0
+.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
.if \base == %rsp
.if \indirect
.set sp_reg, ORC_REG_SP_INDIRECT
@@ -45,11 +50,16 @@
.set type, UNWIND_HINT_TYPE_REGS
.endif
- UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type
+ UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type signal=\signal
+.endm
+
+.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 signal=1
+ UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
.endm
-.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0
- UNWIND_HINT_REGS base=\base offset=\offset partial=1
+.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal
.endm
.macro UNWIND_HINT_FUNC
@@ -67,7 +77,7 @@
#else
#define UNWIND_HINT_FUNC \
- UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0)
+ UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0)
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 2963a2f5dbc4..d7f6592b74a9 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -45,7 +45,7 @@ extern const struct vdso_image vdso_image_x32;
extern const struct vdso_image vdso_image_32;
#endif
-extern void __init init_vdso_image(const struct vdso_image *image);
+extern int __init init_vdso_image(const struct vdso_image *image);
extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index 1936f21ed8cd..4cf6794f9d68 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -318,6 +318,8 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
}
#define vdso_calc_delta vdso_calc_delta
+int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts);
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/x86/include/asm/vdso/processor.h b/arch/x86/include/asm/vdso/processor.h
index 57b1a7034c64..2cbce97d29ea 100644
--- a/arch/x86/include/asm/vdso/processor.h
+++ b/arch/x86/include/asm/vdso/processor.h
@@ -18,6 +18,10 @@ static __always_inline void cpu_relax(void)
rep_nop();
}
+struct getcpu_cache;
+
+notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused);
+
#endif /* __ASSEMBLY__ */
#endif /* __ASM_VDSO_PROCESSOR_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 8757078d4442..3b12e6b99412 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void)
wrmsrl(MSR_VM_HSAVE_PA, 0);
rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM to ensure INIT and NMI
+ * aren't blocked, e.g. if a fatal error occurred between CLGI
+ * and STGI. Note, STGI may #UD if SVM is disabled from NMI
+ * context between reading EFER and executing STGI. In that
+ * case, GIF must already be set, otherwise the NMI would have
+ * been blocked, so just eat the fault.
+ */
+ asm_volatile_goto("1: stgi\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "memory" : fault);
+fault:
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
}
/** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498dc600bd5c..0d02c4aafa6f 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -13,7 +13,9 @@
#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/types.h>
+
#include <uapi/asm/vmx.h>
#include <asm/vmxfeatures.h>
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index e9170457697e..88085f369ff6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -259,11 +259,15 @@ struct x86_legacy_features {
* VMMCALL under SEV-ES. Needs to return 'false'
* if the checks fail. Called from the #VC
* exception handler.
+ * @is_private_mmio: For CoCo VMs, must map MMIO address as private.
+ * Used when device is emulated by a paravisor
+ * layer in the VM context.
*/
struct x86_hyper_runtime {
void (*pin_vcpu)(int cpu);
void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs);
bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);
+ bool (*is_private_mmio)(u64 addr);
};
/**
@@ -285,6 +289,8 @@ struct x86_hyper_runtime {
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
+ * @realmode_reserve: reserve memory for realmode trampoline
+ * @realmode_init: initialize realmode trampoline
* @hyper: x86 hypervisor specific runtime callbacks
*/
struct x86_platform_ops {
@@ -301,6 +307,8 @@ struct x86_platform_ops {
void (*apic_post_init)(void);
struct x86_legacy_features legacy;
void (*set_legacy_features)(void);
+ void (*realmode_reserve)(void);
+ void (*realmode_init)(void);
struct x86_hyper_runtime hyper;
struct x86_guest guest;
};
@@ -322,5 +330,7 @@ extern void x86_init_uint_noop(unsigned int unused);
extern bool bool_x86_init_noop(void);
extern void x86_op_int_noop(int cpu);
extern bool x86_pnpbios_disabled(void);
+extern int set_rtc_noop(const struct timespec64 *now);
+extern void get_rtc_noop(struct timespec64 *now);
#endif
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 6daa9b0c8d11..a3c29b1496c8 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -89,11 +89,21 @@
* Sub-leaf 2: EAX: host tsc frequency in kHz
*/
+#define XEN_CPUID_TSC_EMULATED (1u << 0)
+#define XEN_CPUID_HOST_TSC_RELIABLE (1u << 1)
+#define XEN_CPUID_RDTSCP_INSTR_AVAIL (1u << 2)
+
+#define XEN_CPUID_TSC_MODE_DEFAULT (0)
+#define XEN_CPUID_TSC_MODE_ALWAYS_EMULATE (1u)
+#define XEN_CPUID_TSC_MODE_NEVER_EMULATE (2u)
+#define XEN_CPUID_TSC_MODE_PVRDTSCP (3u)
+
/*
* Leaf 5 (0x40000x04)
* HVM-specific features
* Sub-leaf 0: EAX: Features
* Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: ECX: domain id (iff EAX has XEN_HVM_CPUID_DOMID_PRESENT flag)
*/
#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
@@ -102,12 +112,16 @@
#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
#define XEN_HVM_CPUID_DOMID_PRESENT (1u << 4) /* domid is present in ECX */
/*
- * Bits 55:49 from the IO-APIC RTE and bits 11:5 from the MSI address can be
- * used to store high bits for the Destination ID. This expands the Destination
- * ID field from 8 to 15 bits, allowing to target APIC IDs up 32768.
+ * With interrupt format set to 0 (non-remappable) bits 55:49 from the
+ * IO-APIC RTE and bits 11:5 from the MSI address can be used to store
+ * high bits for the Destination ID. This expands the Destination ID
+ * field from 8 to 15 bits, allowing to target APIC IDs up 32768.
*/
#define XEN_HVM_CPUID_EXT_DEST_ID (1u << 5)
-/* Per-vCPU event channel upcalls */
+/*
+ * Per-vCPU event channel upcalls work correctly with physical IRQs
+ * bound to event channels.
+ */
#define XEN_HVM_CPUID_UPCALL_VECTOR (1u << 6)
/*
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index e5e0fe10c692..a2dd24947eb8 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -382,7 +382,7 @@ MULTI_stack_switch(struct multicall_entry *mcl,
}
#endif
-static inline int
+static __always_inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
return _hypercall2(int, sched_op, cmd, arg);
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 16f548a661cf..5fc35f889cd1 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -38,9 +38,11 @@ extern struct start_info *xen_start_info;
#include <asm/processor.h>
+#define XEN_SIGNATURE "XenVMMXenVMM"
+
static inline uint32_t xen_cpuid_base(void)
{
- return hypervisor_cpuid_base("XenVMMXenVMM", 2);
+ return hypervisor_cpuid_base(XEN_SIGNATURE, 2);
}
struct pci_dev;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 46de10a809ec..1a6a1f987949 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/stddef.h>
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -53,14 +54,6 @@
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
-struct kvm_memory_alias {
- __u32 slot; /* this has a different namespace than memory slots */
- __u32 flags;
- __u64 guest_phys_addr;
- __u64 memory_size;
- __u64 target_phys_addr;
-};
-
/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
struct kvm_pic_state {
__u8 last_irr; /* edge detection */
@@ -214,6 +207,8 @@ struct kvm_msr_list {
struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_READ (1 << 0)
#define KVM_MSR_FILTER_WRITE (1 << 1)
+#define KVM_MSR_FILTER_RANGE_VALID_MASK (KVM_MSR_FILTER_READ | \
+ KVM_MSR_FILTER_WRITE)
__u32 flags;
__u32 nmsrs; /* number of msrs in bitmap */
__u32 base; /* MSR index the bitmap starts at */
@@ -222,8 +217,11 @@ struct kvm_msr_filter_range {
#define KVM_MSR_FILTER_MAX_RANGES 16
struct kvm_msr_filter {
+#ifndef __KERNEL__
#define KVM_MSR_FILTER_DEFAULT_ALLOW (0 << 0)
+#endif
#define KVM_MSR_FILTER_DEFAULT_DENY (1 << 0)
+#define KVM_MSR_FILTER_VALID_MASK (KVM_MSR_FILTER_DEFAULT_DENY)
__u32 flags;
struct kvm_msr_filter_range ranges[KVM_MSR_FILTER_MAX_RANGES];
};
@@ -510,8 +508,8 @@ struct kvm_nested_state {
* KVM_{GET,PUT}_NESTED_STATE ioctl values.
*/
union {
- struct kvm_vmx_nested_state_data vmx[0];
- struct kvm_svm_nested_state_data svm[0];
+ __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx);
+ __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm);
} data;
};
@@ -528,8 +526,40 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+
+/*
+ * Masked event layout.
+ * Bits Description
+ * ---- -----------
+ * 7:0 event select (low bits)
+ * 15:8 umask match
+ * 31:16 unused
+ * 35:32 event select (high bits)
+ * 36:54 unused
+ * 55 exclude bit
+ * 63:56 umask mask
+ */
+
+#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \
+ (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \
+ (((mask) & 0xFFULL) << 56) | \
+ (((match) & 0xFFULL) << 8) | \
+ ((__u64)(!!(exclude)) << 55))
+
+#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
+ (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
+
/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
+/* x86-specific KVM_EXIT_HYPERCALL flags. */
+#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 500b96e71f18..e8d7ebbca1a4 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -16,8 +16,16 @@
#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+#define ARCH_XCOMP_TILECFG 17
+#define ARCH_XCOMP_TILEDATA 18
+
#define ARCH_MAP_VDSO_X32 0x2001
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+#define ARCH_GET_UNTAG_MASK 0x4001
+#define ARCH_ENABLE_TAGGED_ADDR 0x4002
+#define ARCH_GET_MAX_TAG_BITS 0x4003
+#define ARCH_FORCE_TAGGED_SVA 0x4004
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index c47cc7f2feeb..d898432947ff 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -82,6 +82,10 @@
#define X86_CR3_PCID_BITS 12
#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */
+#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT)
+#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */
+#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT)
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
@@ -132,6 +136,8 @@
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */
#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
+#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */
+#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index f69c168391aa..80e1df482337 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -116,6 +116,12 @@
#define SVM_VMGEXIT_AP_CREATE 1
#define SVM_VMGEXIT_AP_DESTROY 2
#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd
+#define SVM_VMGEXIT_TERM_REQUEST 0x8000fffe
+#define SVM_VMGEXIT_TERM_REASON(reason_set, reason_code) \
+ /* SW_EXITINFO1[3:0] */ \
+ (((((u64)reason_set) & 0xf)) | \
+ /* SW_EXITINFO1[11:4] */ \
+ ((((u64)reason_code) & 0xff) << 4))
#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
/* Exit code reserved for hypervisor/software use */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f901658d9f7c..4070a01c11b7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,6 +17,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_sev.o = -pg
+CFLAGS_REMOVE_rethook.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -44,8 +45,7 @@ obj-y += head_$(BITS).o
obj-y += head$(BITS).o
obj-y += ebda.o
obj-y += platform-quirks.o
-obj-y += process_$(BITS).o signal.o
-obj-$(CONFIG_COMPAT) += signal_compat.o
+obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
@@ -54,7 +54,7 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += sys_ia32.o
-obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o
+obj-$(CONFIG_IA32_EMULATION) += sys_ia32.o signal_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
@@ -143,6 +143,8 @@ obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
+obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 907cc98b1938..21b542a6866c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -146,7 +146,11 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
pr_debug("Local APIC address 0x%08x\n", madt->address);
}
- if (madt->header.revision >= 5)
+
+ /* ACPI 6.3 and newer support the online capable bit. */
+ if (acpi_gbl_FADT.header.revision > 6 ||
+ (acpi_gbl_FADT.header.revision == 6 &&
+ acpi_gbl_FADT.minor_revision >= 3))
acpi_support_online_capable = true;
default_acpi_madt_oem_check(madt->header.oem_id,
@@ -188,6 +192,18 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return cpu;
}
+static bool __init acpi_is_processor_usable(u32 lapic_flags)
+{
+ if (lapic_flags & ACPI_MADT_ENABLED)
+ return true;
+
+ if (!acpi_support_online_capable ||
+ (lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ return true;
+
+ return false;
+}
+
static int __init
acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
{
@@ -212,6 +228,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
if (apic_id == 0xffffffff)
return 0;
+ /* don't register processors that cannot be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -250,9 +270,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
return 0;
/* don't register processors that can not be onlined */
- if (acpi_support_online_capable &&
- !(processor->lapic_flags & ACPI_MADT_ENABLED) &&
- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE))
+ if (!acpi_is_processor_usable(processor->lapic_flags))
return 0;
/*
@@ -1841,22 +1859,27 @@ early_param("acpi_sci", setup_acpi_sci);
int __acpi_acquire_global_lock(unsigned int *lock)
{
unsigned int old, new, val;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
- new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
- return ((new & 0x3) < 3) ? -1 : 0;
+ val = (old >> 1) & 0x1;
+ new = (old & ~0x3) + 2 + val;
+ } while (!try_cmpxchg(lock, &old, new));
+
+ if (val)
+ return 0;
+
+ return -1;
}
int __acpi_release_global_lock(unsigned int *lock)
{
- unsigned int old, new, val;
+ unsigned int old, new;
+
+ old = READ_ONCE(*lock);
do {
- old = *lock;
new = old & ~0x3;
- val = cmpxchg(lock, old, new);
- } while (unlikely (val != old));
+ } while (!try_cmpxchg(lock, &old, new));
return old & 0x1;
}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 7945eae5b315..401808b47af3 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -52,17 +52,25 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
if (c->x86_vendor == X86_VENDOR_INTEL &&
(c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
flags->bm_control = 0;
- /*
- * For all recent Centaur CPUs, the ucode will make sure that each
- * core can keep cache coherence with each other while entering C3
- * type state. So, set bm_check to 1 to indicate that the kernel
- * doesn't need to execute a cache flush operation (WBINVD) when
- * entering C3 type state.
- */
+
if (c->x86_vendor == X86_VENDOR_CENTAUR) {
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f &&
- c->x86_stepping >= 0x0e))
+ c->x86_stepping >= 0x0e)) {
+ /*
+ * For all recent Centaur CPUs, the ucode will make sure that each
+ * core can keep cache coherence with each other while entering C3
+ * type state. So, set bm_check to 1 to indicate that the kernel
+ * doesn't need to execute a cache flush operation (WBINVD) when
+ * entering C3 type state.
+ */
flags->bm_check = 1;
+ /*
+ * For all recent Centaur platforms, ARB_DISABLE is a nop.
+ * Set bm_control to zero to indicate that ARB_DISABLE is
+ * not required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..1328c221af30 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,26 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
- early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_rw(smp_processor_id());
- initial_gs = per_cpu_offset(smp_processor_id());
+ /*
+ * As each CPU starts up, it will find its own stack pointer
+ * from its current_task->thread.sp. Typically that will be
+ * the idle thread for a newly-started AP, or even the boot
+ * CPU which will find it set to &init_task in the static
+ * per-cpu data.
+ *
+ * Make the resuming CPU use the temporary stack at startup
+ * by setting current->thread.sp to point to that. The true
+ * %rsp will be restored with the rest of the CPU context,
+ * by do_suspend_lowlevel(). And unwinders don't care about
+ * the abuse of ->thread.sp because it's a dead variable
+ * while the thread is running on the CPU anyway; the true
+ * value is in the actual %rsp register.
+ */
+ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0L;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
/*
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cadcea035e0..f615e0cb6d93 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -116,6 +116,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
+extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
@@ -281,27 +282,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
- /* Mask away "NOT" flag bit for feature to test. */
- u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
- BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
+ BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
/*
* Patch if either:
* - feature is present
- * - feature not present but ALTINSTR_FLAG_INV is set to mean,
+ * - feature not present but ALT_FLAG_NOT is set to mean,
* patch if feature is *NOT* present.
*/
- if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
goto next;
DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
- feature >> 5,
- feature & 0x1f,
+ (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
instr, instr, a->instrlen,
replacement, a->replacementlen);
@@ -339,6 +338,12 @@ next:
}
}
+static inline bool is_jcc32(struct insn *insn)
+{
+ /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
+ return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
+}
+
#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
@@ -377,6 +382,50 @@ static int emit_indirect(int op, int reg, u8 *bytes)
return i;
}
+static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
+{
+ u8 op = insn->opcode.bytes[0];
+ int i = 0;
+
+ /*
+ * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
+ * tail-calls. Deal with them.
+ */
+ if (is_jcc32(insn)) {
+ bytes[i++] = op;
+ op = insn->opcode.bytes[1];
+ goto clang_jcc;
+ }
+
+ if (insn->length == 6)
+ bytes[i++] = 0x2e; /* CS-prefix */
+
+ switch (op) {
+ case CALL_INSN_OPCODE:
+ __text_gen_insn(bytes+i, op, addr+i,
+ __x86_indirect_call_thunk_array[reg],
+ CALL_INSN_SIZE);
+ i += CALL_INSN_SIZE;
+ break;
+
+ case JMP32_INSN_OPCODE:
+clang_jcc:
+ __text_gen_insn(bytes+i, op, addr+i,
+ __x86_indirect_jump_thunk_array[reg],
+ JMP32_INSN_SIZE);
+ i += JMP32_INSN_SIZE;
+ break;
+
+ default:
+ WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
+ return -1;
+ }
+
+ WARN_ON_ONCE(i != insn->length);
+
+ return i;
+}
+
/*
* Rewrite the compiler generated retpoline thunk calls.
*
@@ -409,8 +458,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
BUG_ON(reg == 4);
if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
- !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE))
+ !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return emit_call_track_retpoline(addr, insn, reg, bytes);
+
return -1;
+ }
op = insn->opcode.bytes[0];
@@ -427,8 +480,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
* [ NOP ]
* 1:
*/
- /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
- if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) {
+ if (is_jcc32(insn)) {
cc = insn->opcode.bytes[1] & 0xf;
cc ^= 1; /* invert condition */
@@ -518,6 +570,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
}
#ifdef CONFIG_RETHUNK
+
+#ifdef CONFIG_CALL_THUNKS
+void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
+#endif
+
/*
* Rewrite the compiler generated return thunk tail-calls.
*
@@ -533,14 +590,18 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
{
int i = 0;
- if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- return -1;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
+ if (x86_return_thunk == __x86_return_thunk)
+ return -1;
- bytes[i++] = RET_INSN_OPCODE;
+ i = JMP32_INSN_SIZE;
+ __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
+ } else {
+ bytes[i++] = RET_INSN_OPCODE;
+ }
for (; i < insn->length;)
bytes[i++] = INT3_INSN_OPCODE;
-
return i;
}
@@ -594,6 +655,28 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
#ifdef CONFIG_X86_KERNEL_IBT
+static void poison_endbr(void *addr, bool warn)
+{
+ u32 endbr, poison = gen_endbr_poison();
+
+ if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ return;
+
+ if (!is_endbr(endbr)) {
+ WARN_ON_ONCE(warn);
+ return;
+ }
+
+ DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+
+ /*
+ * When we have IBT, the lack of ENDBR will trigger #CP
+ */
+ DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
+ DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+ text_poke_early(addr, &poison, 4);
+}
+
/*
* Generated by: objtool --ibt
*/
@@ -602,31 +685,391 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
s32 *s;
for (s = start; s < end; s++) {
- u32 endbr, poison = gen_endbr_poison();
void *addr = (void *)s + *s;
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ poison_endbr(addr, true);
+ if (IS_ENABLED(CONFIG_FINEIBT))
+ poison_endbr(addr - 16, false);
+ }
+}
+
+#else
+
+void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
+
+#endif /* CONFIG_X86_KERNEL_IBT */
+
+#ifdef CONFIG_FINEIBT
+
+enum cfi_mode {
+ CFI_DEFAULT,
+ CFI_OFF,
+ CFI_KCFI,
+ CFI_FINEIBT,
+};
+
+static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
+static bool cfi_rand __ro_after_init = true;
+static u32 cfi_seed __ro_after_init;
+
+/*
+ * Re-hash the CFI hash with a boot-time seed while making sure the result is
+ * not a valid ENDBR instruction.
+ */
+static u32 cfi_rehash(u32 hash)
+{
+ hash ^= cfi_seed;
+ while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
+ bool lsb = hash & 1;
+ hash >>= 1;
+ if (lsb)
+ hash ^= 0x80200003;
+ }
+ return hash;
+}
+
+static __init int cfi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ while (str) {
+ char *next = strchr(str, ',');
+ if (next) {
+ *next = 0;
+ next++;
+ }
+
+ if (!strcmp(str, "auto")) {
+ cfi_mode = CFI_DEFAULT;
+ } else if (!strcmp(str, "off")) {
+ cfi_mode = CFI_OFF;
+ cfi_rand = false;
+ } else if (!strcmp(str, "kcfi")) {
+ cfi_mode = CFI_KCFI;
+ } else if (!strcmp(str, "fineibt")) {
+ cfi_mode = CFI_FINEIBT;
+ } else if (!strcmp(str, "norand")) {
+ cfi_rand = false;
+ } else {
+ pr_err("Ignoring unknown cfi option (%s).", str);
+ }
+
+ str = next;
+ }
+
+ return 0;
+}
+early_param("cfi", cfi_parse_cmdline);
+
+/*
+ * kCFI FineIBT
+ *
+ * __cfi_\func: __cfi_\func:
+ * movl $0x12345678,%eax // 5 endbr64 // 4
+ * nop subl $0x12345678,%r10d // 7
+ * nop jz 1f // 2
+ * nop ud2 // 2
+ * nop 1: nop // 1
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ * nop
+ *
+ *
+ * caller: caller:
+ * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6
+ * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4
+ * je 1f // 2 nop4 // 4
+ * ud2 // 2
+ * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5
+ *
+ */
+
+asm( ".pushsection .rodata \n"
+ "fineibt_preamble_start: \n"
+ " endbr64 \n"
+ " subl $0x12345678, %r10d \n"
+ " je fineibt_preamble_end \n"
+ " ud2 \n"
+ " nop \n"
+ "fineibt_preamble_end: \n"
+ ".popsection\n"
+);
+
+extern u8 fineibt_preamble_start[];
+extern u8 fineibt_preamble_end[];
+
+#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
+#define fineibt_preamble_hash 7
+
+asm( ".pushsection .rodata \n"
+ "fineibt_caller_start: \n"
+ " movl $0x12345678, %r10d \n"
+ " sub $16, %r11 \n"
+ ASM_NOP4
+ "fineibt_caller_end: \n"
+ ".popsection \n"
+);
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
+#define fineibt_caller_hash 2
+
+#define fineibt_caller_jmp (fineibt_caller_size - 2)
+
+static u32 decode_preamble_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* b8 78 56 34 12 mov $0x12345678,%eax */
+ if (p[0] == 0xb8)
+ return *(u32 *)(addr + 1);
+
+ return 0; /* invalid hash value */
+}
+
+static u32 decode_caller_hash(void *addr)
+{
+ u8 *p = addr;
+
+ /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */
+ if (p[0] == 0x41 && p[1] == 0xba)
+ return -*(u32 *)(addr + 2);
+
+ /* e8 0c 78 56 34 12 jmp.d8 +12 */
+ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
+ return -*(u32 *)(addr + 2);
+
+ return 0; /* invalid hash value */
+}
+
+/* .retpoline_sites */
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
+ * in tact for later usage. Also see decode_caller_hash() and
+ * cfi_rewrite_callers().
+ */
+ const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
continue;
- if (WARN_ON_ONCE(!is_endbr(endbr)))
+ text_poke_early(addr, jmp, 2);
+ }
+
+ return 0;
+}
+
+static int cfi_enable_callers(s32 *start, s32 *end)
+{
+ /*
+ * Re-enable kCFI, undo what cfi_disable_callers() did.
+ */
+ const u8 mov[] = { 0x41, 0xba };
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (!hash) /* nocfi callers */
continue;
- DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+ text_poke_early(addr, mov, 2);
+ }
- /*
- * When we have IBT, the lack of ENDBR will trigger #CP
- */
- DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
- DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(addr, &poison, 4);
+ return 0;
+}
+
+/* .cfi_sites */
+static int cfi_rand_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ hash = cfi_rehash(hash);
+ text_poke_early(addr + 1, &hash, 4);
+ }
+
+ return 0;
+}
+
+static int cfi_rewrite_preamble(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ hash = decode_preamble_hash(addr);
+ if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
+ addr, addr, 5, addr))
+ return -EINVAL;
+
+ text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+ }
+
+ return 0;
+}
+
+/* .retpoline_sites */
+static int cfi_rand_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ hash = -cfi_rehash(hash);
+ text_poke_early(addr + 2, &hash, 4);
+ }
+ }
+
+ return 0;
+}
+
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++) {
+ void *addr = (void *)s + *s;
+ u32 hash;
+
+ addr -= fineibt_caller_size;
+ hash = decode_caller_hash(addr);
+ if (hash) {
+ text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ }
+ /* rely on apply_retpolines() */
+ }
+
+ return 0;
+}
+
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+ int ret;
+
+ if (WARN_ONCE(fineibt_preamble_size != 16,
+ "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
+ return;
+
+ if (cfi_mode == CFI_DEFAULT) {
+ cfi_mode = CFI_KCFI;
+ if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
+ cfi_mode = CFI_FINEIBT;
+ }
+
+ /*
+ * Rewrite the callers to not use the __cfi_ stubs, such that we might
+ * rewrite them. This disables all CFI. If this succeeds but any of the
+ * later stages fails, we're without CFI.
+ */
+ ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (cfi_rand) {
+ if (builtin)
+ cfi_seed = get_random_u32();
+
+ ret = cfi_rand_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+ }
+
+ switch (cfi_mode) {
+ case CFI_OFF:
+ if (builtin)
+ pr_info("Disabling CFI\n");
+ return;
+
+ case CFI_KCFI:
+ ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("Using kCFI\n");
+ return;
+
+ case CFI_FINEIBT:
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ if (ret)
+ goto err;
+
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ if (ret)
+ goto err;
+
+ if (builtin)
+ pr_info("Using FineIBT CFI\n");
+ return;
+
+ default:
+ break;
}
+
+err:
+ pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
}
#else
-void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
+static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi, bool builtin)
+{
+}
-#endif /* CONFIG_X86_KERNEL_IBT */
+#endif
+
+void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+ s32 *start_cfi, s32 *end_cfi)
+{
+ return __apply_fineibt(start_retpoline, end_retpoline,
+ start_cfi, end_cfi,
+ /* .builtin = */ false);
+}
#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
@@ -934,6 +1377,9 @@ void __init alternative_instructions(void)
*/
apply_paravirt(__parainstructions, __parainstructions_end);
+ __apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+ __cfi_sites, __cfi_sites_end, true);
+
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
@@ -947,6 +1393,12 @@ void __init alternative_instructions(void)
*/
apply_alternatives(__alt_instructions, __alt_instructions_end);
+ /*
+ * Now all calls are established. Apply the call thunks if
+ * required.
+ */
+ callthunks_patch_builtin_calls();
+
apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
#ifdef CONFIG_SMP
@@ -1236,27 +1688,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
return __text_poke(text_poke_memcpy, addr, opcode, len);
}
-/**
- * text_poke_copy - Copy instructions into (an unused part of) RX memory
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy, could be more than 2x PAGE_SIZE
- *
- * Not safe against concurrent execution; useful for JITs to dump
- * new code blocks into unused regions of RX memory. Can be used in
- * conjunction with synchronize_rcu_tasks() to wait for existing
- * execution to quiesce after having made sure no existing functions
- * pointers are live.
- */
-void *text_poke_copy(void *addr, const void *opcode, size_t len)
+void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
+ bool core_ok)
{
unsigned long start = (unsigned long)addr;
size_t patched = 0;
- if (WARN_ON_ONCE(core_kernel_text(start)))
+ if (WARN_ON_ONCE(!core_ok && core_kernel_text(start)))
return NULL;
- mutex_lock(&text_mutex);
while (patched < len) {
unsigned long ptr = start + patched;
size_t s;
@@ -1266,6 +1706,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len)
__text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s);
patched += s;
}
+ return addr;
+}
+
+/**
+ * text_poke_copy - Copy instructions into (an unused part of) RX memory
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy, could be more than 2x PAGE_SIZE
+ *
+ * Not safe against concurrent execution; useful for JITs to dump
+ * new code blocks into unused regions of RX memory. Can be used in
+ * conjunction with synchronize_rcu_tasks() to wait for existing
+ * execution to quiesce after having made sure no existing functions
+ * pointers are live.
+ */
+void *text_poke_copy(void *addr, const void *opcode, size_t len)
+{
+ mutex_lock(&text_mutex);
+ addr = text_poke_copy_locked(addr, opcode, len, false);
mutex_unlock(&text_mutex);
return addr;
}
@@ -1311,6 +1770,11 @@ void text_poke_sync(void)
on_each_cpu(do_sync_core, NULL, 1);
}
+/*
+ * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
+ * this thing. When len == 6 everything is prefixed with 0x0f and we map
+ * opcode to Jcc.d8, using len to distinguish.
+ */
struct text_poke_loc {
/* addr := _stext + rel_addr */
s32 rel_addr;
@@ -1432,6 +1896,10 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
int3_emulate_jmp(regs, (long)ip + tp->disp);
break;
+ case 0x70 ... 0x7f: /* Jcc */
+ int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
+ break;
+
default:
BUG();
}
@@ -1505,16 +1973,26 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
+ u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
+ u8 _new[POKE_MAX_OPCODE_SIZE+1];
+ const u8 *new = tp[i].text;
int len = tp[i].len;
if (len - INT3_INSN_SIZE > 0) {
memcpy(old + INT3_INSN_SIZE,
text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
+
+ if (len == 6) {
+ _new[0] = 0x0f;
+ memcpy(_new + 1, new, 5);
+ new = _new;
+ }
+
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
- (const char *)tp[i].text + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
+
do_sync++;
}
@@ -1542,8 +2020,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* The old instruction is recorded so that the event can be
* processed forwards or backwards.
*/
- perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
- tp[i].text, len);
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
}
if (do_sync) {
@@ -1560,10 +2037,15 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* replacing opcode.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- if (tp[i].text[0] == INT3_INSN_OPCODE)
+ u8 byte = tp[i].text[0];
+
+ if (tp[i].len == 6)
+ byte = 0x0f;
+
+ if (byte == INT3_INSN_OPCODE)
continue;
- text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
+ text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
do_sync++;
}
@@ -1581,9 +2063,11 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
- int ret, i;
+ int ret, i = 0;
- memcpy((void *)tp->text, opcode, len);
+ if (len == 6)
+ i = 1;
+ memcpy((void *)tp->text, opcode+i, len-i);
if (!emulate)
emulate = opcode;
@@ -1594,6 +2078,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
tp->len = len;
tp->opcode = insn.opcode.bytes[0];
+ if (is_jcc32(&insn)) {
+ /*
+ * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
+ */
+ tp->opcode = insn.opcode.bytes[1] - 0x10;
+ }
+
switch (tp->opcode) {
case RET_INSN_OPCODE:
case JMP32_INSN_OPCODE:
@@ -1608,8 +2099,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
default:
BUG_ON(len != insn.length);
- };
-
+ }
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1619,6 +2109,7 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
case CALL_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case JMP8_INSN_OPCODE:
+ case 0x70 ... 0x7f: /* Jcc */
tp->disp = insn.immediate.value;
break;
@@ -1681,11 +2172,6 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
{
struct text_poke_loc *tp;
- if (unlikely(system_state == SYSTEM_BOOTING)) {
- text_poke_early(addr, opcode, len);
- return;
- }
-
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
@@ -1707,11 +2193,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *
{
struct text_poke_loc tp;
- if (unlikely(system_state == SYSTEM_BOOTING)) {
- text_poke_early(addr, opcode, len);
- return;
- }
-
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 19a0207e529f..56a917df410d 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -504,7 +504,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
}
a = aper + iommu_size;
- iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
+ iommu_size -= round_up(a, PMD_SIZE) - a;
if (iommu_size < 64*1024*1024) {
pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4266b64631a4..7e331e8f3692 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -36,6 +36,7 @@
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
+#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
@@ -79,6 +80,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
{}
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c6876d3ea4b1..770557110051 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -422,10 +422,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
- rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
- } while (rsvd != new);
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
- rsvd &= ~APIC_EILVT_MASKED;
+ rsvd = new & ~APIC_EILVT_MASKED;
if (rsvd && rsvd != vector)
pr_info("LVT offset %d assigned for vector 0x%02x\n",
offset, rsvd);
@@ -1931,16 +1930,19 @@ void __init check_x2apic(void)
}
}
#else /* CONFIG_X86_X2APIC */
-static int __init validate_x2apic(void)
+void __init check_x2apic(void)
{
if (!apic_is_x2apic_enabled())
- return 0;
+ return;
/*
- * Checkme: Can we simply turn off x2apic here instead of panic?
+ * Checkme: Can we simply turn off x2APIC here instead of disabling the APIC?
*/
- panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+ pr_err("Kernel does not support x2APIC, please recompile with CONFIG_X86_X2APIC.\n");
+ pr_err("Disabling APIC, expect reduced performance and functionality.\n");
+
+ disable_apic = 1;
+ setup_clear_cpu_cap(X86_FEATURE_APIC);
}
-early_initcall(validate_x2apic);
static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a868b76cd3d4..4241dc243aa8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -66,6 +66,7 @@
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/pgtable.h>
+#include <asm/x86_init.h>
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
@@ -2364,9 +2365,8 @@ static int mp_irqdomain_create(int ioapic)
return -ENODEV;
}
- ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
- (void *)(long)ioapic);
-
+ ip->irqdomain = irq_domain_create_hierarchy(parent, 0, hwirqs, fn, cfg->ops,
+ (void *)(long)ioapic);
if (!ip->irqdomain) {
/* Release fw handle if it was allocated above */
if (!cfg->dev)
@@ -2374,8 +2374,6 @@ static int mp_irqdomain_create(int ioapic)
return -ENOMEM;
}
- ip->irqdomain->parent = parent;
-
if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
cfg->type == IOAPIC_DOMAIN_STRICT)
ioapic_dynirq_base = max(ioapic_dynirq_base,
@@ -2480,17 +2478,21 @@ static int io_apic_get_redir_entries(int ioapic)
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
+ unsigned int ret;
+
/*
* dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
* gsi_top if ioapic_dynirq_base hasn't been initialized yet.
*/
- if (!ioapic_initialized)
- return gsi_top;
+ ret = ioapic_dynirq_base ? : gsi_top;
+
/*
- * For DT enabled machines ioapic_dynirq_base is irrelevant and not
- * updated. So simply return @from if ioapic_dynirq_base == 0.
+ * For DT enabled machines ioapic_dynirq_base is irrelevant and
+ * always 0. gsi_top can be 0 if there is no IO/APIC registered.
+ * 0 is an invalid interrupt number for dynamic allocations. Return
+ * @from instead.
*/
- return ioapic_dynirq_base ? : from;
+ return ret ? : from;
}
#ifdef CONFIG_X86_32
@@ -2683,10 +2685,15 @@ static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
pgprot_t flags = FIXMAP_PAGE_NOCACHE;
/*
- * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot
+ * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot
* bits, just like normal ioremap():
*/
- flags = pgprot_decrypted(flags);
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (x86_platform.hyper.is_private_mmio(phys))
+ flags = pgprot_encrypted(flags);
+ else
+ flags = pgprot_decrypted(flags);
+ }
__set_fixmap(idx, phys, flags);
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7517eb05bdc1..35d5b8fb18ef 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -142,70 +142,139 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
return ret;
}
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
+/**
+ * pci_dev_has_default_msi_parent_domain - Check whether the device has the default
+ * MSI parent domain associated
+ * @dev: Pointer to the PCI device
*/
-static struct irq_chip pci_msi_controller = {
- .name = "PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_set_affinity = msi_set_affinity,
- .flags = IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_AFFINITY_PRE_STARTUP,
-};
+bool pci_dev_has_default_msi_parent_domain(struct pci_dev *dev)
+{
+ struct irq_domain *domain = dev_get_msi_domain(&dev->dev);
-int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
- msi_alloc_info_t *arg)
+ if (!domain)
+ domain = dev_get_msi_domain(&dev->bus->dev);
+ if (!domain)
+ return false;
+
+ return domain == x86_vector_domain;
+}
+
+/**
+ * x86_msi_prepare - Setup of msi_alloc_info_t for allocations
+ * @domain: The domain for which this setup happens
+ * @dev: The device for which interrupts are allocated
+ * @nvec: The number of vectors to allocate
+ * @alloc: The allocation info structure to initialize
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. It is always invoked from the
+ * top level interrupt domain. The domain specific allocation
+ * functionality is determined via the @domain's bus token which allows to
+ * map the X86 specific allocation type.
+ */
+static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *alloc)
{
- init_irq_alloc_info(arg, NULL);
- if (to_pci_dev(dev)->msix_enabled) {
- arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
- } else {
- arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
- arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+ struct msi_domain_info *info = domain->host_data;
+
+ init_irq_alloc_info(alloc, NULL);
+
+ switch (info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ case DOMAIN_BUS_PCI_DEVICE_IMS:
+ alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ return 0;
+ default:
+ return -EINVAL;
}
-
- return 0;
}
-EXPORT_SYMBOL_GPL(pci_msi_prepare);
-static struct msi_domain_ops pci_msi_domain_ops = {
- .msi_prepare = pci_msi_prepare,
-};
+/**
+ * x86_init_dev_msi_info - Domain info setup for MSI domains
+ * @dev: The device for which the domain should be created
+ * @domain: The (root) domain providing this callback
+ * @real_parent: The real parent domain of the to initialize domain
+ * @info: The domain info for the to initialize domain
+ *
+ * This function is to be used for all types of MSI domains above the x86
+ * vector domain and any intermediates. The domain specific functionality
+ * is determined via the @real_parent.
+ */
+static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ const struct msi_parent_ops *pops = real_parent->msi_parent_ops;
+
+ /* MSI parent domain specific settings */
+ switch (real_parent->bus_token) {
+ case DOMAIN_BUS_ANY:
+ /* Only the vector domain can have the ANY token */
+ if (WARN_ON_ONCE(domain != real_parent))
+ return false;
+ info->chip->irq_set_affinity = msi_set_affinity;
+ /* See msi_set_affinity() for the gory details */
+ info->flags |= MSI_FLAG_NOMASK_QUIRK;
+ break;
+ case DOMAIN_BUS_DMAR:
+ case DOMAIN_BUS_AMDVI:
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Is the target supported? */
+ switch(info->bus_token) {
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
+ break;
+ case DOMAIN_BUS_PCI_DEVICE_IMS:
+ if (!(pops->supported_flags & MSI_FLAG_PCI_IMS))
+ return false;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Mask out the domain specific MSI feature flags which are not
+ * supported by the real parent.
+ */
+ info->flags &= pops->supported_flags;
+ /* Enforce the required flags */
+ info->flags |= X86_VECTOR_MSI_FLAGS_REQUIRED;
+
+ /* This is always invoked from the top level MSI domain! */
+ info->ops->msi_prepare = x86_msi_prepare;
+
+ info->chip->irq_ack = irq_chip_ack_parent;
+ info->chip->irq_retrigger = irq_chip_retrigger_hierarchy;
+ info->chip->flags |= IRQCHIP_SKIP_SET_WAKE |
+ IRQCHIP_AFFINITY_PRE_STARTUP;
-static struct msi_domain_info pci_msi_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
+ info->handler = handle_edge_irq;
+ info->handler_name = "edge";
+
+ return true;
+}
+
+static const struct msi_parent_ops x86_vector_msi_parent_ops = {
+ .supported_flags = X86_VECTOR_MSI_FLAGS_SUPPORTED,
+ .init_dev_msi_info = x86_init_dev_msi_info,
};
struct irq_domain * __init native_create_pci_msi_domain(void)
{
- struct fwnode_handle *fn;
- struct irq_domain *d;
-
if (disable_apic)
return NULL;
- fn = irq_domain_alloc_named_fwnode("PCI-MSI");
- if (!fn)
- return NULL;
-
- d = pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
- x86_vector_domain);
- if (!d) {
- irq_domain_free_fwnode(fn);
- pr_warn("Failed to initialize PCI-MSI irqdomain.\n");
- } else {
- d->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
- }
- return d;
+ x86_vector_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT;
+ x86_vector_domain->msi_parent_ops = &x86_vector_msi_parent_ops;
+ return x86_vector_domain;
}
void __init x86_create_pci_msi_domain(void)
@@ -213,41 +282,19 @@ void __init x86_create_pci_msi_domain(void)
x86_pci_msi_default_domain = x86_init.irqs.create_pci_msi_domain();
}
-#ifdef CONFIG_IRQ_REMAP
-static struct irq_chip pci_msi_ir_controller = {
- .name = "IR-PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE |
- IRQCHIP_AFFINITY_PRE_STARTUP,
-};
-
-static struct msi_domain_info pci_msi_ir_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
- .ops = &pci_msi_domain_ops,
- .chip = &pci_msi_ir_controller,
- .handler = handle_edge_irq,
- .handler_name = "edge",
-};
-
-struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
- const char *name, int id)
+/* Keep around for hyperV */
+int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec,
+ msi_alloc_info_t *arg)
{
- struct fwnode_handle *fn;
- struct irq_domain *d;
+ init_irq_alloc_info(arg, NULL);
- fn = irq_domain_alloc_named_id_fwnode(name, id);
- if (!fn)
- return NULL;
- d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- if (!d)
- irq_domain_free_fwnode(fn);
- return d;
+ if (to_pci_dev(dev)->msix_enabled)
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
+ else
+ arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
+ return 0;
}
-#endif
+EXPORT_SYMBOL_GPL(pci_msi_prepare);
#ifdef CONFIG_DMAR_TABLE
/*
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3e6f6b448f6a..c1efebd27e6c 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -539,10 +539,6 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
if (disable_apic)
return -ENXIO;
- /* Currently vector allocator can't guarantee contiguous allocations */
- if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
- return -ENOSYS;
-
/*
* Catch any attempt to touch the cascade interrupt on a PIC
* equipped system.
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e696e22d0531..b2b2b7f3e03f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -9,11 +9,7 @@
#include "local.h"
-struct cluster_mask {
- unsigned int clusterid;
- int node;
- struct cpumask mask;
-};
+#define apic_cluster(apicid) ((apicid) >> 4)
/*
* __x2apic_send_IPI_mask() possibly needs to read
@@ -23,8 +19,7 @@ struct cluster_mask {
static u32 *x86_cpu_to_logical_apicid __read_mostly;
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
@@ -60,10 +55,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
for_each_cpu(cpu, tmpmsk) {
- struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
dest = 0;
- for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+ for_each_cpu_and(clustercpu, tmpmsk, cmsk)
dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
@@ -71,7 +66,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
- cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+ cpumask_andnot(tmpmsk, tmpmsk, cmsk);
}
local_irq_restore(flags);
@@ -105,55 +100,98 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
static void init_x2apic_ldr(void)
{
- struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
- u32 cluster, apicid = apic_read(APIC_LDR);
- unsigned int cpu;
+ struct cpumask *cmsk = this_cpu_read(cluster_masks);
- x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
+ BUG_ON(!cmsk);
- if (cmsk)
- goto update;
-
- cluster = apicid >> 16;
- for_each_online_cpu(cpu) {
- cmsk = per_cpu(cluster_masks, cpu);
- /* Matching cluster found. Link and update it. */
- if (cmsk && cmsk->clusterid == cluster)
- goto update;
+ cpumask_set_cpu(smp_processor_id(), cmsk);
+}
+
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
+{
+ int cpu_i;
+
+ for_each_present_cpu(cpu_i) {
+ struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+ continue;
+
+ if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+ continue;
+
+ BUG_ON(*cpu_cmsk);
+ *cpu_cmsk = cmsk;
}
- cmsk = cluster_hotplug_mask;
- cmsk->clusterid = cluster;
- cluster_hotplug_mask = NULL;
-update:
- this_cpu_write(cluster_masks, cmsk);
- cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
}
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
+ struct cpumask *cmsk = NULL;
+ unsigned int cpu_i;
+
+ /*
+ * At boot time, the CPU present mask is stable. The cluster mask is
+ * allocated for the first CPU in the cluster and propagated to all
+ * present siblings in the cluster. If the cluster mask is already set
+ * on entry to this function for a given CPU, there is nothing to do.
+ */
if (per_cpu(cluster_masks, cpu))
return 0;
+
+ if (system_state < SYSTEM_RUNNING)
+ goto alloc;
+
/*
- * If a hotplug spare mask exists, check whether it's on the right
- * node. If not, free it and allocate a new one.
+ * On post boot hotplug for a CPU which was not present at boot time,
+ * iterate over all possible CPUs (even those which are not present
+ * any more) to find any existing cluster mask.
*/
- if (cluster_hotplug_mask) {
- if (cluster_hotplug_mask->node == node)
- return 0;
- kfree(cluster_hotplug_mask);
+ for_each_possible_cpu(cpu_i) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu_i);
+ /*
+ * If the cluster is already initialized, just store
+ * the mask and return. There's no need to propagate.
+ */
+ if (cmsk) {
+ per_cpu(cluster_masks, cpu) = cmsk;
+ return 0;
+ }
+ }
}
-
- cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
- GFP_KERNEL, node);
- if (!cluster_hotplug_mask)
+ /*
+ * No CPU in the cluster has ever been initialized, so fall through to
+ * the boot time code which will also populate the cluster mask for any
+ * other CPU in the cluster which is (now) present.
+ */
+alloc:
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
return -ENOMEM;
- cluster_hotplug_mask->node = node;
+ per_cpu(cluster_masks, cpu) = cmsk;
+ prefill_clustermask(cmsk, cpu, cluster);
+
return 0;
}
static int x2apic_prepare_cpu(unsigned int cpu)
{
- if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = apic_cluster(phys_apicid);
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+ x86_cpu_to_logical_apicid[cpu] = logical_apicid;
+
+ if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
return -ENOMEM;
if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
return -ENOMEM;
@@ -162,10 +200,10 @@ static int x2apic_prepare_cpu(unsigned int cpu)
static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
if (cmsk)
- cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ cpumask_clear_cpu(dead_cpu, cmsk);
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6bde05a86b4e..896bc41cb2ba 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -97,7 +97,10 @@ static void init_x2apic_ldr(void)
static int x2apic_phys_probe(void)
{
- if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
+ if (!x2apic_mode)
+ return 0;
+
+ if (x2apic_phys || x2apic_fadt_phys())
return 1;
return apic == &apic_x2apic_phys;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 60e330cdbd17..c6c15ce1952f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
- ibt = ibt_save();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
apm_bios_call_asm(call->func, call->ebx, call->ecx,
&call->eax, &call->ebx, &call->ecx, &call->edx,
@@ -690,7 +690,7 @@ static long __apm_bios_call_simple(void *_call)
apm_irq_save(flags);
firmware_restrict_branch_speculation_start();
- ibt = ibt_save();
+ ibt = ibt_save(true);
APM_DO_SAVE_SEGS;
error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx,
&call->eax);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 437308004ef2..dc3576303f1a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -7,6 +7,7 @@
#define COMPILE_OFFSETS
#include <linux/crypto.h>
+#include <crypto/aria.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/hardirq.h>
@@ -75,12 +76,18 @@ static void __used common(void)
OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
BLANK();
+ OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8);
+ OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9);
OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
+ OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi);
+ OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi);
+ OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx);
+ OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx);
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
@@ -107,4 +114,17 @@ static void __used common(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
+ OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+ OFFSET(X86_current_task, pcpu_hot, current_task);
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+ OFFSET(X86_call_depth, pcpu_hot, call_depth);
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
+ /* Offset for fields in aria_ctx */
+ BLANK();
+ OFFSET(ARIA_CTX_enc_key, aria_ctx, enc_key);
+ OFFSET(ARIA_CTX_dec_key, aria_ctx, dec_key);
+ OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
+#endif
+
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 9b698215d261..bb65371ea9df 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -57,7 +57,7 @@ int main(void)
BLANK();
#ifdef CONFIG_STACKPROTECTOR
- DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary));
+ OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary);
BLANK();
#endif
return 0;
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
new file mode 100644
index 000000000000..22ab13966427
--- /dev/null
+++ b/arch/x86/kernel/callthunks.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "callthunks: " fmt
+
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+#include <linux/memory.h>
+#include <linux/moduleloader.h>
+#include <linux/static_call.h>
+
+#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
+#include <asm/cpu.h>
+#include <asm/ftrace.h>
+#include <asm/insn.h>
+#include <asm/kexec.h>
+#include <asm/nospec-branch.h>
+#include <asm/paravirt.h>
+#include <asm/sections.h>
+#include <asm/switch_to.h>
+#include <asm/sync_core.h>
+#include <asm/text-patching.h>
+#include <asm/xen/hypercall.h>
+
+static int __initdata_or_module debug_callthunks;
+
+#define prdbg(fmt, args...) \
+do { \
+ if (debug_callthunks) \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
+} while(0)
+
+static int __init debug_thunks(char *str)
+{
+ debug_callthunks = 1;
+ return 1;
+}
+__setup("debug-callthunks", debug_thunks);
+
+#ifdef CONFIG_CALL_THUNKS_DEBUG
+DEFINE_PER_CPU(u64, __x86_call_count);
+DEFINE_PER_CPU(u64, __x86_ret_count);
+DEFINE_PER_CPU(u64, __x86_stuffs_count);
+DEFINE_PER_CPU(u64, __x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_SYMBOL_GPL(__x86_call_count);
+#endif
+
+extern s32 __call_sites[], __call_sites_end[];
+
+struct thunk_desc {
+ void *template;
+ unsigned int template_size;
+};
+
+struct core_text {
+ unsigned long base;
+ unsigned long end;
+ const char *name;
+};
+
+static bool thunks_initialized __ro_after_init;
+
+static const struct core_text builtin_coretext = {
+ .base = (unsigned long)_text,
+ .end = (unsigned long)_etext,
+ .name = "builtin",
+};
+
+asm (
+ ".pushsection .rodata \n"
+ ".global skl_call_thunk_template \n"
+ "skl_call_thunk_template: \n"
+ __stringify(INCREMENT_CALL_DEPTH)" \n"
+ ".global skl_call_thunk_tail \n"
+ "skl_call_thunk_tail: \n"
+ ".popsection \n"
+);
+
+extern u8 skl_call_thunk_template[];
+extern u8 skl_call_thunk_tail[];
+
+#define SKL_TMPL_SIZE \
+ ((unsigned int)(skl_call_thunk_tail - skl_call_thunk_template))
+
+extern void error_entry(void);
+extern void xen_error_entry(void);
+extern void paranoid_entry(void);
+
+static inline bool within_coretext(const struct core_text *ct, void *addr)
+{
+ unsigned long p = (unsigned long)addr;
+
+ return ct->base <= p && p < ct->end;
+}
+
+static inline bool within_module_coretext(void *addr)
+{
+ bool ret = false;
+
+#ifdef CONFIG_MODULES
+ struct module *mod;
+
+ preempt_disable();
+ mod = __module_address((unsigned long)addr);
+ if (mod && within_module_core((unsigned long)addr, mod))
+ ret = true;
+ preempt_enable();
+#endif
+ return ret;
+}
+
+static bool is_coretext(const struct core_text *ct, void *addr)
+{
+ if (ct && within_coretext(ct, addr))
+ return true;
+ if (within_coretext(&builtin_coretext, addr))
+ return true;
+ return within_module_coretext(addr);
+}
+
+static bool skip_addr(void *dest)
+{
+ if (dest == error_entry)
+ return true;
+ if (dest == paranoid_entry)
+ return true;
+ if (dest == xen_error_entry)
+ return true;
+ /* Does FILL_RSB... */
+ if (dest == __switch_to_asm)
+ return true;
+ /* Accounts directly */
+ if (dest == ret_from_fork)
+ return true;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (dest == start_cpu0)
+ return true;
+#endif
+#ifdef CONFIG_FUNCTION_TRACER
+ if (dest == __fentry__)
+ return true;
+#endif
+#ifdef CONFIG_KEXEC_CORE
+ if (dest >= (void *)relocate_kernel &&
+ dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
+ return true;
+#endif
+#ifdef CONFIG_XEN
+ if (dest >= (void *)hypercall_page &&
+ dest < (void*)hypercall_page + PAGE_SIZE)
+ return true;
+#endif
+ return false;
+}
+
+static __init_or_module void *call_get_dest(void *addr)
+{
+ struct insn insn;
+ void *dest;
+ int ret;
+
+ ret = insn_decode_kernel(&insn, addr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Patched out call? */
+ if (insn.opcode.bytes[0] != CALL_INSN_OPCODE)
+ return NULL;
+
+ dest = addr + insn.length + insn.immediate.value;
+ if (skip_addr(dest))
+ return NULL;
+ return dest;
+}
+
+static const u8 nops[] = {
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+ 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90,
+};
+
+static void *patch_dest(void *dest, bool direct)
+{
+ unsigned int tsize = SKL_TMPL_SIZE;
+ u8 *pad = dest - tsize;
+
+ /* Already patched? */
+ if (!bcmp(pad, skl_call_thunk_template, tsize))
+ return pad;
+
+ /* Ensure there are nops */
+ if (bcmp(pad, nops, tsize)) {
+ pr_warn_once("Invalid padding area for %pS\n", dest);
+ return NULL;
+ }
+
+ if (direct)
+ memcpy(pad, skl_call_thunk_template, tsize);
+ else
+ text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+ return pad;
+}
+
+static __init_or_module void patch_call(void *addr, const struct core_text *ct)
+{
+ void *pad, *dest;
+ u8 bytes[8];
+
+ if (!within_coretext(ct, addr))
+ return;
+
+ dest = call_get_dest(addr);
+ if (!dest || WARN_ON_ONCE(IS_ERR(dest)))
+ return;
+
+ if (!is_coretext(ct, dest))
+ return;
+
+ pad = patch_dest(dest, within_coretext(ct, dest));
+ if (!pad)
+ return;
+
+ prdbg("Patch call at: %pS %px to %pS %px -> %px \n", addr, addr,
+ dest, dest, pad);
+ __text_gen_insn(bytes, CALL_INSN_OPCODE, addr, pad, CALL_INSN_SIZE);
+ text_poke_early(addr, bytes, CALL_INSN_SIZE);
+}
+
+static __init_or_module void
+patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
+{
+ s32 *s;
+
+ for (s = start; s < end; s++)
+ patch_call((void *)s + *s, ct);
+}
+
+static __init_or_module void
+patch_paravirt_call_sites(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end,
+ const struct core_text *ct)
+{
+ struct paravirt_patch_site *p;
+
+ for (p = start; p < end; p++)
+ patch_call(p->instr, ct);
+}
+
+static __init_or_module void
+callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
+{
+ prdbg("Patching call sites %s\n", ct->name);
+ patch_call_sites(cs->call_start, cs->call_end, ct);
+ patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ prdbg("Patching call sites done%s\n", ct->name);
+}
+
+void __init callthunks_patch_builtin_calls(void)
+{
+ struct callthunk_sites cs = {
+ .call_start = __call_sites,
+ .call_end = __call_sites_end,
+ .pv_start = __parainstructions,
+ .pv_end = __parainstructions_end
+ };
+
+ if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ return;
+
+ pr_info("Setting up call depth tracking\n");
+ mutex_lock(&text_mutex);
+ callthunks_setup(&cs, &builtin_coretext);
+ static_call_force_reinit();
+ thunks_initialized = true;
+ mutex_unlock(&text_mutex);
+}
+
+void *callthunks_translate_call_dest(void *dest)
+{
+ void *target;
+
+ lockdep_assert_held(&text_mutex);
+
+ if (!thunks_initialized || skip_addr(dest))
+ return dest;
+
+ if (!is_coretext(NULL, dest))
+ return dest;
+
+ target = patch_dest(dest, false);
+ return target ? : dest;
+}
+
+bool is_callthunk(void *addr)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ void *tmpl = skl_call_thunk_template;
+ unsigned long dest;
+
+ dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
+ if (!thunks_initialized || skip_addr((void *)dest))
+ return false;
+
+ return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+}
+
+#ifdef CONFIG_BPF_JIT
+int x86_call_depth_emit_accounting(u8 **pprog, void *func)
+{
+ unsigned int tmpl_size = SKL_TMPL_SIZE;
+ void *tmpl = skl_call_thunk_template;
+
+ if (!thunks_initialized)
+ return 0;
+
+ /* Is function call target a thunk? */
+ if (func && is_callthunk(func))
+ return 0;
+
+ memcpy(*pprog, tmpl, tmpl_size);
+ *pprog += tmpl_size;
+ return tmpl_size;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+void noinline callthunks_patch_module_calls(struct callthunk_sites *cs,
+ struct module *mod)
+{
+ struct core_text ct = {
+ .base = (unsigned long)mod->mem[MOD_TEXT].base,
+ .end = (unsigned long)mod->mem[MOD_TEXT].base + mod->mem[MOD_TEXT].size,
+ .name = mod->name,
+ };
+
+ if (!thunks_initialized)
+ return;
+
+ mutex_lock(&text_mutex);
+ callthunks_setup(cs, &ct);
+ mutex_unlock(&text_mutex);
+}
+#endif /* CONFIG_MODULES */
+
+#if defined(CONFIG_CALL_THUNKS_DEBUG) && defined(CONFIG_DEBUG_FS)
+static int callthunks_debug_show(struct seq_file *m, void *p)
+{
+ unsigned long cpu = (unsigned long)m->private;
+
+ seq_printf(m, "C: %16llu R: %16llu S: %16llu X: %16llu\n,",
+ per_cpu(__x86_call_count, cpu),
+ per_cpu(__x86_ret_count, cpu),
+ per_cpu(__x86_stuffs_count, cpu),
+ per_cpu(__x86_ctxsw_count, cpu));
+ return 0;
+}
+
+static int callthunks_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, callthunks_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_ops = {
+ .open = callthunks_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init callthunks_debugfs_init(void)
+{
+ struct dentry *dir;
+ unsigned long cpu;
+
+ dir = debugfs_create_dir("callthunks", NULL);
+ for_each_possible_cpu(cpu) {
+ void *arg = (void *)cpu;
+ char name [10];
+
+ sprintf(name, "cpu%lu", cpu);
+ debugfs_create_file(name, 0644, dir, arg, &dfs_ops);
+ }
+ return 0;
+}
+__initcall(callthunks_debugfs_init);
+#endif
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index f10a921ee756..d7e3ceaf75c1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,9 +17,6 @@ KMSAN_SANITIZE_common.o := n
# As above, instrumenting secondary CPU boot code causes boot hangs.
KCSAN_SANITIZE_common.o := n
-# Make sure load_percpu_segment has no stackprotector
-CFLAGS_common.o := -fno-stack-protector
-
obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 860b60273df3..571abf808ea3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
}
-#define MSR_AMD64_DE_CFG 0xC0011029
-
static void init_amd_ln(struct cpuinfo_x86 *c)
{
/*
@@ -882,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
}
}
#endif
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
static void init_amd_zn(struct cpuinfo_x86 *c)
@@ -922,6 +929,10 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ /* AMD FSRM also implies FSRS */
+ if (cpu_has(c, X86_FEATURE_FSRM))
+ set_cpu_cap(c, X86_FEATURE_FSRS);
+
/* get apicid instead of initial apic id from cpuid */
c->apicid = hard_smp_processor_id();
@@ -958,15 +969,15 @@ static void init_amd(struct cpuinfo_x86 *c)
init_amd_cacheinfo(c);
- if (cpu_has(c, X86_FEATURE_XMM2)) {
+ if (!cpu_has(c, X86_FEATURE_LFENCE_RDTSC) && cpu_has(c, X86_FEATURE_XMM2)) {
/*
* Use LFENCE for execution serialization. On families which
* don't have that MSR, LFENCE is already serializing.
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -985,7 +996,7 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
/*
@@ -998,6 +1009,17 @@ static void init_amd(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
+
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
}
#ifdef CONFIG_X86_32
@@ -1160,24 +1182,43 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
return false;
}
-void set_dr_addr_mask(unsigned long mask, int dr)
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long[4], amd_dr_addr_mask);
+
+static unsigned int amd_msr_dr_addr_masks[] = {
+ MSR_F16H_DR0_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK,
+ MSR_F16H_DR1_ADDR_MASK + 1,
+ MSR_F16H_DR1_ADDR_MASK + 2
+};
+
+void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr)
{
- if (!boot_cpu_has(X86_FEATURE_BPEXT))
+ int cpu = smp_processor_id();
+
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
return;
- switch (dr) {
- case 0:
- wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
- break;
- case 1:
- case 2:
- case 3:
- wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
- break;
- default:
- break;
- }
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return;
+
+ if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask)
+ return;
+
+ wrmsr(amd_msr_dr_addr_masks[dr], mask, 0);
+ per_cpu(amd_dr_addr_mask, cpu)[dr] = mask;
+}
+
+unsigned long amd_get_dr_addr_mask(unsigned int dr)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_BPEXT))
+ return 0;
+
+ if (WARN_ON_ONCE(dr >= ARRAY_SIZE(amd_msr_dr_addr_masks)))
+ return 0;
+
+ return per_cpu(amd_dr_addr_mask[dr], smp_processor_id());
}
+EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
u32 amd_get_highest_perf(void)
{
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 1f60a2b27936..fdbb5f07448f 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -330,7 +330,16 @@ static void __init bp_init_freq_invariance(void)
static void disable_freq_invariance_workfn(struct work_struct *work)
{
+ int cpu;
+
static_branch_disable(&arch_scale_freq_key);
+
+ /*
+ * Set arch_freq_scale to a default value on all cpus
+ * This negates the effect of scaling
+ */
+ for_each_possible_cpu(cpu)
+ per_cpu(arch_freq_scale, cpu) = SCHED_CAPACITY_SCALE;
}
static DECLARE_WORK(disable_freq_invariance_work,
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 3e3230cccaa7..182af64387d0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -33,6 +33,7 @@
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
+#include <asm/cpu.h>
#include "cpu.h"
@@ -60,11 +61,18 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
static DEFINE_MUTEX(spec_ctrl_mutex);
+/* Update SPEC_CTRL MSR and its cached copy unconditionally */
+static void update_spec_ctrl(u64 val)
+{
+ this_cpu_write(x86_spec_ctrl_current, val);
+ wrmsrl(MSR_IA32_SPEC_CTRL, val);
+}
+
/*
* Keep track of the SPEC_CTRL MSR value for the current task, which may differ
* from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
*/
-void write_spec_ctrl_current(u64 val, bool force)
+void update_spec_ctrl_cond(u64 val)
{
if (this_cpu_read(x86_spec_ctrl_current) == val)
return;
@@ -75,11 +83,11 @@ void write_spec_ctrl_current(u64 val, bool force)
* When KERNEL_IBRS this MSR is written on return-to-user, unless
* forced the update can be delayed until that time.
*/
- if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
+ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
wrmsrl(MSR_IA32_SPEC_CTRL, val);
}
-u64 spec_ctrl_current(void)
+noinstr u64 spec_ctrl_current(void)
{
return this_cpu_read(x86_spec_ctrl_current);
}
@@ -137,9 +145,17 @@ void __init check_bugs(void)
* have unknown values. AMD64_LS_CFG MSR is cached in the early AMD
* init code as it is not enumerated and depends on the family.
*/
- if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
+ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) {
rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
+ /*
+ * Previously running kernel (kexec), may have some controls
+ * turned ON. Clear them and let the mitigations setup below
+ * rediscover them based on configuration.
+ */
+ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
+ }
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -768,8 +784,7 @@ static int __init nospectre_v1_cmdline(char *str)
}
early_param("nospectre_v1", nospectre_v1_cmdline);
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
@@ -780,6 +795,7 @@ enum retbleed_mitigation {
RETBLEED_MITIGATION_IBPB,
RETBLEED_MITIGATION_IBRS,
RETBLEED_MITIGATION_EIBRS,
+ RETBLEED_MITIGATION_STUFF,
};
enum retbleed_mitigation_cmd {
@@ -787,6 +803,7 @@ enum retbleed_mitigation_cmd {
RETBLEED_CMD_AUTO,
RETBLEED_CMD_UNRET,
RETBLEED_CMD_IBPB,
+ RETBLEED_CMD_STUFF,
};
static const char * const retbleed_strings[] = {
@@ -795,6 +812,7 @@ static const char * const retbleed_strings[] = {
[RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB",
[RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
[RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
+ [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing",
};
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
@@ -824,8 +842,12 @@ static int __init retbleed_parse_cmdline(char *str)
retbleed_cmd = RETBLEED_CMD_UNRET;
} else if (!strcmp(str, "ibpb")) {
retbleed_cmd = RETBLEED_CMD_IBPB;
+ } else if (!strcmp(str, "stuff")) {
+ retbleed_cmd = RETBLEED_CMD_STUFF;
} else if (!strcmp(str, "nosmt")) {
retbleed_nosmt = true;
+ } else if (!strcmp(str, "force")) {
+ setup_force_cpu_bug(X86_BUG_RETBLEED);
} else {
pr_err("Ignoring unknown retbleed option (%s).", str);
}
@@ -872,6 +894,21 @@ static void __init retbleed_select_mitigation(void)
}
break;
+ case RETBLEED_CMD_STUFF:
+ if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) &&
+ spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
+ retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
+
+ } else {
+ if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING))
+ pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
+ else
+ pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n");
+
+ goto do_cmd_auto;
+ }
+ break;
+
do_cmd_auto:
case RETBLEED_CMD_AUTO:
default:
@@ -909,6 +946,12 @@ do_cmd_auto:
mitigate_smt = true;
break;
+ case RETBLEED_MITIGATION_STUFF:
+ setup_force_cpu_cap(X86_FEATURE_RETHUNK);
+ setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH);
+ x86_set_skl_return_thunk();
+ break;
+
default:
break;
}
@@ -919,7 +962,7 @@ do_cmd_auto:
/*
* Let IBRS trump all on Intel without affecting the effects of the
- * retbleed= cmdline option.
+ * retbleed= cmdline option except for call depth based stuffing
*/
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
switch (spectre_v2_enabled) {
@@ -932,7 +975,8 @@ do_cmd_auto:
retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
break;
default:
- pr_err(RETBLEED_INTEL_MSG);
+ if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF)
+ pr_err(RETBLEED_INTEL_MSG);
}
}
@@ -1090,10 +1134,7 @@ spectre_v2_parse_user_cmdline(void)
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
{
- return mode == SPECTRE_V2_IBRS ||
- mode == SPECTRE_V2_EIBRS ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_EIBRS_LFENCE;
+ return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
}
static void __init
@@ -1158,12 +1199,19 @@ spectre_v2_user_select_mitigation(void)
}
/*
- * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible,
- * STIBP is not required.
+ * If no STIBP, enhanced IBRS is enabled, or SMT impossible, STIBP
+ * is not required.
+ *
+ * Enhanced IBRS also protects against cross-thread branch target
+ * injection in user-mode as the IBRS bit remains always set which
+ * implicitly enables cross-thread protections. However, in legacy IBRS
+ * mode, the IBRS bit is set only on kernel entry and cleared on return
+ * to userspace. This disables the implicit cross-thread protection,
+ * so allow for STIBP to be selected in that case.
*/
if (!boot_cpu_has(X86_FEATURE_STIBP) ||
!smt_possible ||
- spectre_v2_in_ibrs_mode(spectre_v2_enabled))
+ spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return;
/*
@@ -1193,9 +1241,9 @@ static const char * const spectre_v2_strings[] = {
[SPECTRE_V2_NONE] = "Vulnerable",
[SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines",
[SPECTRE_V2_LFENCE] = "Mitigation: LFENCE",
- [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
- [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
- [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
+ [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS",
+ [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE",
+ [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines",
[SPECTRE_V2_IBRS] = "Mitigation: IBRS",
};
@@ -1264,7 +1312,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
!boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) {
- pr_err("%s selected but CPU doesn't have eIBRS. Switching to AUTO select\n",
+ pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
}
@@ -1295,7 +1343,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) {
pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1328,7 +1376,7 @@ static void __init spec_ctrl_disable_kernel_rrsba(void)
if (ia32_cap & ARCH_CAP_RRSBA) {
x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
@@ -1406,6 +1454,7 @@ static void __init spectre_v2_select_mitigation(void)
if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
boot_cpu_has_bug(X86_BUG_RETBLEED) &&
retbleed_cmd != RETBLEED_CMD_OFF &&
+ retbleed_cmd != RETBLEED_CMD_STUFF &&
boot_cpu_has(X86_FEATURE_IBRS) &&
boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
mode = SPECTRE_V2_IBRS;
@@ -1449,8 +1498,12 @@ static void __init spectre_v2_select_mitigation(void)
pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
if (spectre_v2_in_ibrs_mode(mode)) {
- x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) {
+ msr_set_bit(MSR_EFER, _EFER_AUTOIBRS);
+ } else {
+ x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ }
}
switch (mode) {
@@ -1534,8 +1587,8 @@ static void __init spectre_v2_select_mitigation(void)
/*
* Retpoline protects the kernel, but doesn't protect firmware. IBRS
* and Enhanced IBRS protect firmware too, so enable IBRS around
- * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
- * enabled.
+ * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't
+ * otherwise enabled.
*
* Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
* the user might select retpoline on the kernel command line and if
@@ -1564,7 +1617,7 @@ static void __init spectre_v2_select_mitigation(void)
static void update_stibp_msr(void * __unused)
{
u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
- write_spec_ctrl_current(val, true);
+ update_spec_ctrl(val);
}
/* Update x86_spec_ctrl_base in case SMT state changed. */
@@ -1797,7 +1850,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
x86_amd_ssb_disable();
} else {
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
}
}
@@ -1944,6 +1997,8 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
if (ctrl == PR_SPEC_FORCE_DISABLE)
task_set_spec_ib_force_disable(task);
task_update_spec_tif(task);
+ if (task == current)
+ indirect_branch_prediction_barrier();
break;
default:
return -ERANGE;
@@ -2048,7 +2103,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
void x86_spec_ctrl_setup_ap(void)
{
if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
- write_spec_ctrl_current(x86_spec_ctrl_base, true);
+ update_spec_ctrl(x86_spec_ctrl_base);
if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
x86_amd_ssb_disable();
@@ -2199,74 +2254,74 @@ static const char * const l1tf_vmx_states[] = {
static ssize_t l1tf_show_state(char *buf)
{
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
sched_smt_active())) {
- return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation]);
+ return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation]);
}
- return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
- l1tf_vmx_states[l1tf_vmx_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
+ l1tf_vmx_states[l1tf_vmx_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t itlb_multihit_show_state(char *buf)
{
if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
!boot_cpu_has(X86_FEATURE_VMX))
- return sprintf(buf, "KVM: Mitigation: VMX unsupported\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n");
else if (!(cr4_read_shadow() & X86_CR4_VMXE))
- return sprintf(buf, "KVM: Mitigation: VMX disabled\n");
+ return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n");
else if (itlb_multihit_kvm_mitigation)
- return sprintf(buf, "KVM: Mitigation: Split huge pages\n");
+ return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n");
else
- return sprintf(buf, "KVM: Vulnerable\n");
+ return sysfs_emit(buf, "KVM: Vulnerable\n");
}
#else
static ssize_t l1tf_show_state(char *buf)
{
- return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
+ return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG);
}
static ssize_t itlb_multihit_show_state(char *buf)
{
- return sprintf(buf, "Processor vulnerable\n");
+ return sysfs_emit(buf, "Processor vulnerable\n");
}
#endif
static ssize_t mds_show_state(char *buf)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- mds_strings[mds_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ mds_strings[mds_mitigation]);
}
if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
- sched_smt_active() ? "mitigated" : "disabled"));
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
+ sched_smt_active() ? "mitigated" : "disabled"));
}
- return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t tsx_async_abort_show_state(char *buf)
{
if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) ||
(taa_mitigation == TAA_MITIGATION_OFF))
- return sprintf(buf, "%s\n", taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]);
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
- return sprintf(buf, "%s; SMT Host state unknown\n",
- taa_strings[taa_mitigation]);
+ return sysfs_emit(buf, "%s; SMT Host state unknown\n",
+ taa_strings[taa_mitigation]);
}
- return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
- sched_smt_active() ? "vulnerable" : "disabled");
+ return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation],
+ sched_smt_active() ? "vulnerable" : "disabled");
}
static ssize_t mmio_stale_data_show_state(char *buf)
@@ -2288,7 +2343,7 @@ static ssize_t mmio_stale_data_show_state(char *buf)
static char *stibp_state(void)
{
- if (spectre_v2_in_ibrs_mode(spectre_v2_enabled))
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
return "";
switch (spectre_v2_user_stibp) {
@@ -2334,73 +2389,72 @@ static char *pbrsb_eibrs_state(void)
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
- return sprintf(buf, "Vulnerable: LFENCE\n");
+ return sysfs_emit(buf, "Vulnerable: LFENCE\n");
if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
- return sprintf(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n");
if (sched_smt_active() && unprivileged_ebpf_enabled() &&
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
- return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
+ return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sprintf(buf, "%s%s%s%s%s%s%s\n",
- spectre_v2_strings[spectre_v2_enabled],
- ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
- stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
- pbrsb_eibrs_state(),
- spectre_v2_module_string());
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s\n",
+ spectre_v2_strings[spectre_v2_enabled],
+ ibpb_state(),
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ stibp_state(),
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ pbrsb_eibrs_state(),
+ spectre_v2_module_string());
}
static ssize_t srbds_show_state(char *buf)
{
- return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
+ return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
static ssize_t retbleed_show_state(char *buf)
{
if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
- boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
- return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
- return sprintf(buf, "%s; SMT %s\n",
- retbleed_strings[retbleed_mitigation],
- !sched_smt_active() ? "disabled" :
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
- spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
- "enabled with STIBP protection" : "vulnerable");
+ return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation],
+ !sched_smt_active() ? "disabled" :
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
+ spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ?
+ "enabled with STIBP protection" : "vulnerable");
}
- return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
+ return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
}
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
if (!boot_cpu_has_bug(bug))
- return sprintf(buf, "Not affected\n");
+ return sysfs_emit(buf, "Not affected\n");
switch (bug) {
case X86_BUG_CPU_MELTDOWN:
if (boot_cpu_has(X86_FEATURE_PTI))
- return sprintf(buf, "Mitigation: PTI\n");
+ return sysfs_emit(buf, "Mitigation: PTI\n");
if (hypervisor_is_type(X86_HYPER_XEN_PV))
- return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
+ return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n");
break;
case X86_BUG_SPECTRE_V1:
- return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
+ return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]);
case X86_BUG_SPECTRE_V2:
return spectre_v2_show_state(buf);
case X86_BUG_SPEC_STORE_BYPASS:
- return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
+ return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]);
case X86_BUG_L1TF:
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
@@ -2430,7 +2484,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
break;
}
- return sprintf(buf, "Vulnerable\n");
+ return sysfs_emit(buf, "Vulnerable\n");
}
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 66556833d7af..4063e8991211 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -11,15 +11,19 @@
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/cpu.h>
+#include <linux/cpuhotplug.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/sysfs.h>
#include <linux/pci.h>
+#include <linux/stop_machine.h>
#include <asm/cpufeature.h>
#include <asm/cacheinfo.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
#include "cpu.h"
@@ -35,6 +39,9 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
/* Shared L2 cache maps */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_l2c_shared_map);
+/* Kernel controls MTRR and/or PAT MSRs. */
+unsigned int memory_caching_control __ro_after_init;
+
struct _cache_table {
unsigned char descriptor;
char cache_type;
@@ -727,7 +734,7 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
void init_intel_cacheinfo(struct cpuinfo_x86 *c)
{
/* Cache sizes */
- unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0;
+ unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0;
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
@@ -828,9 +835,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
case LVL_3:
l3 += cache_table[k].size;
break;
- case LVL_TRACE:
- trace += cache_table[k].size;
- break;
}
break;
@@ -1040,3 +1044,175 @@ int populate_cache_leaves(unsigned int cpu)
return 0;
}
+
+/*
+ * Disable and enable caches. Needed for changing MTRRs and the PAT MSR.
+ *
+ * Since we are disabling the cache don't allow any interrupts,
+ * they would run extremely slow and would only increase the pain.
+ *
+ * The caller must ensure that local interrupts are disabled and
+ * are reenabled after cache_enable() has been called.
+ */
+static unsigned long saved_cr4;
+static DEFINE_RAW_SPINLOCK(cache_disable_lock);
+
+void cache_disable(void) __acquires(cache_disable_lock)
+{
+ unsigned long cr0;
+
+ /*
+ * Note that this is not ideal
+ * since the cache is only flushed/disabled for this CPU while the
+ * MTRRs are changed, but changing this requires more invasive
+ * changes to the way the kernel boots
+ */
+
+ raw_spin_lock(&cache_disable_lock);
+
+ /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
+ cr0 = read_cr0() | X86_CR0_CD;
+ write_cr0(cr0);
+
+ /*
+ * Cache flushing is the most time-consuming step when programming
+ * the MTRRs. Fortunately, as per the Intel Software Development
+ * Manual, we can skip it if the processor supports cache self-
+ * snooping.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+
+ /* Save value of CR4 and clear Page Global Enable (bit 7) */
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
+ saved_cr4 = __read_cr4();
+ __write_cr4(saved_cr4 & ~X86_CR4_PGE);
+ }
+
+ /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_disable();
+
+ /* Again, only flush caches if we have to. */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
+}
+
+void cache_enable(void) __releases(cache_disable_lock)
+{
+ /* Flush TLBs (no need to flush caches - they are disabled) */
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ flush_tlb_local();
+
+ if (cpu_feature_enabled(X86_FEATURE_MTRR))
+ mtrr_enable();
+
+ /* Enable caches */
+ write_cr0(read_cr0() & ~X86_CR0_CD);
+
+ /* Restore value of CR4 */
+ if (cpu_feature_enabled(X86_FEATURE_PGE))
+ __write_cr4(saved_cr4);
+
+ raw_spin_unlock(&cache_disable_lock);
+}
+
+static void cache_cpu_init(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cache_disable();
+
+ if (memory_caching_control & CACHE_MTRR)
+ mtrr_generic_set_state();
+
+ if (memory_caching_control & CACHE_PAT)
+ pat_cpu_init();
+
+ cache_enable();
+ local_irq_restore(flags);
+}
+
+static bool cache_aps_delayed_init = true;
+
+void set_cache_aps_delayed_init(bool val)
+{
+ cache_aps_delayed_init = val;
+}
+
+bool get_cache_aps_delayed_init(void)
+{
+ return cache_aps_delayed_init;
+}
+
+static int cache_rendezvous_handler(void *unused)
+{
+ if (get_cache_aps_delayed_init() || !cpu_online(smp_processor_id()))
+ cache_cpu_init();
+
+ return 0;
+}
+
+void __init cache_bp_init(void)
+{
+ mtrr_bp_init();
+ pat_bp_init();
+
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+void cache_bp_restore(void)
+{
+ if (memory_caching_control)
+ cache_cpu_init();
+}
+
+static int cache_ap_init(unsigned int cpu)
+{
+ if (!memory_caching_control || get_cache_aps_delayed_init())
+ return 0;
+
+ /*
+ * Ideally we should hold mtrr_mutex here to avoid MTRR entries
+ * changed, but this routine will be called in CPU boot time,
+ * holding the lock breaks it.
+ *
+ * This routine is called in two cases:
+ *
+ * 1. very early time of software resume, when there absolutely
+ * isn't MTRR entry changes;
+ *
+ * 2. CPU hotadd time. We let mtrr_add/del_page hold cpuhotplug
+ * lock to prevent MTRR entry changes
+ */
+ stop_machine_from_inactive_cpu(cache_rendezvous_handler, NULL,
+ cpu_callout_mask);
+
+ return 0;
+}
+
+/*
+ * Delayed cache initialization for all AP's
+ */
+void cache_aps_init(void)
+{
+ if (!memory_caching_control || !get_cache_aps_delayed_init())
+ return;
+
+ stop_machine(cache_rendezvous_handler, NULL, cpu_online_mask);
+ set_cache_aps_delayed_init(false);
+}
+
+static int __init cache_ap_register(void)
+{
+ cpuhp_setup_state_nocalls(CPUHP_AP_CACHECTRL_STARTING,
+ "x86/cachectrl:starting",
+ cache_ap_init, NULL);
+ return 0;
+}
+core_initcall(cache_ap_register);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3e508f239098..80710a68ef7d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,9 +22,9 @@
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
+#include <linux/stackprotector.h>
#include <asm/cmdline.h>
-#include <asm/stackprotector.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -52,6 +52,7 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/cacheinfo.h>
#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
@@ -120,6 +121,7 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
@@ -566,17 +568,18 @@ static __init int setup_disable_pku(char *arg)
return 1;
}
__setup("nopku", setup_disable_pku);
-#endif /* CONFIG_X86_64 */
+#endif
#ifdef CONFIG_X86_KERNEL_IBT
-__noendbr u64 ibt_save(void)
+__noendbr u64 ibt_save(bool disable)
{
u64 msr = 0;
if (cpu_feature_enabled(X86_FEATURE_IBT)) {
rdmsrl(MSR_IA32_S_CET, msr);
- wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
+ if (disable)
+ wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN);
}
return msr;
@@ -609,6 +612,7 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c)
if (!ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
+ wrmsrl(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
return;
}
@@ -701,16 +705,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- __loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#endif
-}
-
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
@@ -738,16 +732,45 @@ void load_fixmap_gdt(int cpu)
}
EXPORT_SYMBOL_GPL(load_fixmap_gdt);
-/*
- * Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one.
+/**
+ * switch_gdt_and_percpu_base - Switch to direct GDT and runtime per CPU base
+ * @cpu: The CPU number for which this is invoked
+ *
+ * Invoked during early boot to switch from early GDT and early per CPU to
+ * the direct GDT and the runtime per CPU area. On 32-bit the percpu base
+ * switch is implicit by loading the direct GDT. On 64bit this requires
+ * to update GSBASE.
*/
-void switch_to_new_gdt(int cpu)
+void __init switch_gdt_and_percpu_base(int cpu)
{
- /* Load the original GDT */
load_direct_gdt(cpu);
- /* Reload the per-cpu base */
- load_percpu_segment(cpu);
+
+#ifdef CONFIG_X86_64
+ /*
+ * No need to load %gs. It is already correct.
+ *
+ * Writing %gs on 64bit would zero GSBASE which would make any per
+ * CPU operation up to the point of the wrmsrl() fault.
+ *
+ * Set GSBASE to the new offset. Until the wrmsrl() happens the
+ * early mapping is still valid. That means the GSBASE update will
+ * lose any prior per CPU data which was not copied over in
+ * setup_per_cpu_areas().
+ *
+ * This works even with stackprotector enabled because the
+ * per CPU stack canary is 0 in both per CPU areas.
+ */
+ wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
+#else
+ /*
+ * %fs is already set to __KERNEL_PERCPU, but after switching GDT
+ * it is required to load FS again so that the 'hidden' part is
+ * updated from the new GDT. Up to this point the early per CPU
+ * translation is active. Any content of the early per CPU data
+ * which was not copied over in setup_per_cpu_areas() is lost.
+ */
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -1072,6 +1095,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000001f)
c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
+ if (c->extended_cpuid_level >= 0x80000021)
+ c->x86_capability[CPUID_8000_0021_EAX] = cpuid_eax(0x80000021);
+
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1205,8 +1231,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/* Zhaoxin Family 7 */
VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
@@ -1235,6 +1261,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define MMIO_SBDS BIT(2)
/* CPU is affected by RETbleed, speculating where you would not expect it */
#define RETBLEED BIT(3)
+/* CPU is affected by SMT (cross-thread) return predictions */
+#define SMT_RSB BIT(4)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
@@ -1266,8 +1294,8 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
- VULNBL_AMD(0x17, RETBLEED),
- VULNBL_HYGON(0x18, RETBLEED),
+ VULNBL_AMD(0x17, RETBLEED | SMT_RSB),
+ VULNBL_HYGON(0x18, RETBLEED | SMT_RSB),
{}
};
@@ -1317,8 +1345,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
- if (ia32_cap & ARCH_CAP_IBRS_ALL)
+ /*
+ * AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
+ * flag and protect from vendor-specific bugs via the whitelist.
+ */
+ if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
+ if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
+ !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ }
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
!(ia32_cap & ARCH_CAP_MDS_NO)) {
@@ -1380,10 +1416,8 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
- if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
- !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
- !(ia32_cap & ARCH_CAP_PBRSB_NO))
- setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
+ if (cpu_matches(cpu_vuln_blacklist, SMT_RSB))
+ setup_force_cpu_bug(X86_BUG_SMT_RSB);
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
@@ -1661,9 +1695,7 @@ void check_null_seg_clears_base(struct cpuinfo_x86 *c)
if (!IS_ENABLED(CONFIG_X86_64))
return;
- /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */
- if (c->extended_cpuid_level >= 0x80000021 &&
- cpuid_eax(0x80000021) & BIT(6))
+ if (cpu_has(c, X86_FEATURE_NULL_SEL_CLR_BASE))
return;
/*
@@ -1932,13 +1964,13 @@ void __init identify_boot_cpu(void)
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
pr_info("CET detected: Indirect Branch Tracking enabled\n");
#ifdef CONFIG_X86_32
- sysenter_setup();
enable_sep_cpu();
#endif
cpu_detect_tlb(&boot_cpu_data);
setup_cr_pinning();
tsx_init();
+ lkgs_init();
}
void identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1948,7 +1980,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
@@ -1993,27 +2024,18 @@ static __init int setup_clearcpuid(char *arg)
}
__setup("clearcpuid=", setup_clearcpuid);
+DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
+ .current_task = &init_task,
+ .preempt_count = INIT_PREEMPT_COUNT,
+ .top_of_stack = TOP_OF_INIT_STACK,
+};
+EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
-/*
- * The following percpu variables are hot. Align current_task to
- * cacheline size such that they fall in the same cacheline.
- */
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
-DEFINE_PER_CPU(void *, hardirq_stack_ptr);
-DEFINE_PER_CPU(bool, hardirq_stack_inuse);
-
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
-EXPORT_PER_CPU_SYMBOL(__preempt_count);
-
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
-
static void wrmsrl_cstar(unsigned long val)
{
/*
@@ -2064,20 +2086,6 @@ void syscall_init(void)
#else /* CONFIG_X86_64 */
-DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
-EXPORT_PER_CPU_SYMBOL(__preempt_count);
-
-/*
- * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
- * the top of the kernel stack. Use an extra percpu variable to track the
- * top of the kernel stack directly.
- */
-DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
- (unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
-
#ifdef CONFIG_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
@@ -2128,7 +2136,6 @@ static void wait_for_master_cpu(int cpu)
#endif
}
-#ifdef CONFIG_X86_64
static inline void setup_getcpu(int cpu)
{
unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu));
@@ -2150,6 +2157,7 @@ static inline void setup_getcpu(int cpu)
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S);
}
+#ifdef CONFIG_X86_64
static inline void ucode_cpu_init(int cpu)
{
if (cpu)
@@ -2169,8 +2177,6 @@ static inline void tss_setup_ist(struct tss_struct *tss)
#else /* CONFIG_X86_64 */
-static inline void setup_getcpu(int cpu) { }
-
static inline void ucode_cpu_init(int cpu)
{
show_ucode_info_early();
@@ -2248,12 +2254,6 @@ void cpu_init(void)
boot_cpu_has(X86_FEATURE_TSC) || boot_cpu_has(X86_FEATURE_DE))
cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
- /*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- switch_to_new_gdt(cpu);
-
if (IS_ENABLED(CONFIG_X86_64)) {
loadsegment(fs, 0);
memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -2306,30 +2306,45 @@ void cpu_init_secondary(void)
#endif
#ifdef CONFIG_MICROCODE_LATE_LOADING
-/*
+/**
+ * store_cpu_caps() - Store a snapshot of CPU capabilities
+ * @curr_info: Pointer where to store it
+ *
+ * Returns: None
+ */
+void store_cpu_caps(struct cpuinfo_x86 *curr_info)
+{
+ /* Reload CPUID max function as it might've changed. */
+ curr_info->cpuid_level = cpuid_eax(0);
+
+ /* Copy all capability leafs and pick up the synthetic ones. */
+ memcpy(&curr_info->x86_capability, &boot_cpu_data.x86_capability,
+ sizeof(curr_info->x86_capability));
+
+ /* Get the hardware CPUID leafs */
+ get_cpu_cap(curr_info);
+}
+
+/**
+ * microcode_check() - Check if any CPU capabilities changed after an update.
+ * @prev_info: CPU capabilities stored before an update.
+ *
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU
* hotplug lock.
+ *
+ * Return: None
*/
-void microcode_check(void)
+void microcode_check(struct cpuinfo_x86 *prev_info)
{
- struct cpuinfo_x86 info;
+ struct cpuinfo_x86 curr_info;
perf_check_microcode();
- /* Reload CPUID max function as it might've changed. */
- info.cpuid_level = cpuid_eax(0);
-
- /*
- * Copy all capability leafs to pick up the synthetic ones so that
- * memcmp() below doesn't fail on that. The ones coming from CPUID will
- * get overwritten in get_cpu_cap().
- */
- memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability));
-
- get_cpu_cap(&info);
+ store_cpu_caps(&curr_info);
- if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)))
+ if (!memcmp(&prev_info->x86_capability, &curr_info.x86_capability,
+ sizeof(prev_info->x86_capability)))
return;
pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 7c9b5893c30a..f97b0fe13da8 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -83,6 +83,12 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
-extern u64 x86_read_arch_cap_msr(void);
-
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index c881bcafba7d..f6748c8bd647 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -68,6 +68,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL },
+ { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
@@ -75,6 +77,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
{ X86_FEATURE_SGX1, X86_FEATURE_SGX },
{ X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
+ { X86_FEATURE_SGX_EDECCSSA, X86_FEATURE_SGX1 },
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 21fd425088fe..5a2962c492d3 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c)
* msr_set_bit() uses the safe accessors, too, even if the MSR
* is not present.
*/
- msr_set_bit(MSR_F10H_DECFG,
- MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+ msr_set_bit(MSR_AMD64_DE_CFG,
+ MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT);
/* A serializing LFENCE stops RDTSC speculation */
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -339,7 +339,7 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARAT);
/* Hygon CPUs don't reset SS attributes on SYSRET, Xen does. */
- if (!cpu_has(c, X86_FEATURE_XENPV))
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 2d7ea5480ec3..1c4639588ff9 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -210,12 +210,154 @@ int intel_cpu_collect_info(struct ucode_cpu_info *uci)
csig.rev = intel_get_microcode_revision();
uci->cpu_sig = csig;
- uci->valid = 1;
return 0;
}
EXPORT_SYMBOL_GPL(intel_cpu_collect_info);
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+int intel_find_matching_signature(void *mc, unsigned int csig, int cpf)
+{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
+ int i;
+
+ if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
+
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
+
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_find_matching_signature);
+
+/**
+ * intel_microcode_sanity_check() - Sanity check microcode file.
+ * @mc: Pointer to the microcode file contents.
+ * @print_err: Display failure reason if true, silent if false.
+ * @hdr_type: Type of file, i.e. normal microcode file or In Field Scan file.
+ * Validate if the microcode header type matches with the type
+ * specified here.
+ *
+ * Validate certain header fields and verify if computed checksum matches
+ * with the one specified in the header.
+ *
+ * Return: 0 if the file passes all the checks, -EINVAL if any of the checks
+ * fail.
+ */
+int intel_microcode_sanity_check(void *mc, bool print_err, int hdr_type)
+{
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
+
+ if (mc_header->ldrver != 1 || mc_header->hdrver != hdr_type) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format. Header type %d\n",
+ mc_header->hdrver);
+ return -EINVAL;
+ }
+
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
+
+ if (ext_table_size < EXT_HEADER_SIZE ||
+ ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
+
+ /*
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
+ */
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
+ }
+
+ if (!ext_table_size)
+ return 0;
+
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(intel_microcode_sanity_check);
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -1034,7 +1176,31 @@ static const struct {
static struct ratelimit_state bld_ratelimit;
-static DEFINE_SEMAPHORE(buslock_sem);
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
static inline bool match_option(const char *arg, int arglen, const char *opt)
{
@@ -1146,12 +1312,20 @@ static void split_lock_init(void)
split_lock_verify_msr(sld_state != sld_off);
}
-static void __split_lock_reenable(struct work_struct *work)
+static void __split_lock_reenable_unlock(struct work_struct *work)
{
sld_update_msr(true);
up(&buslock_sem);
}
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+
/*
* If a CPU goes offline with pending delayed work to re-enable split lock
* detection then the delayed work will be executed on some other CPU. That
@@ -1169,10 +1343,9 @@ static int splitlock_cpu_offline(unsigned int cpu)
return 0;
}
-static DECLARE_DELAYED_WORK(split_lock_reenable, __split_lock_reenable);
-
static void split_lock_warn(unsigned long ip)
{
+ struct delayed_work *work;
int cpu;
if (!current->reported_split_lock)
@@ -1180,14 +1353,26 @@ static void split_lock_warn(unsigned long ip)
current->comm, current->pid, ip);
current->reported_split_lock = 1;
- /* misery factor #1, sleep 10ms before trying to execute split lock */
- if (msleep_interruptible(10) > 0)
- return;
- /* Misery factor #2, only allow one buslocked disabled core at a time */
- if (down_interruptible(&buslock_sem) == -EINTR)
- return;
+ if (sysctl_sld_mitigate) {
+ /*
+ * misery factor #1:
+ * sleep 10ms before trying to execute split lock.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one buslocked disabled core at a time.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ work = &sl_reenable_unlock;
+ } else {
+ work = &sl_reenable;
+ }
+
cpu = get_cpu();
- schedule_delayed_work_on(cpu, &split_lock_reenable, 2);
+ schedule_delayed_work_on(cpu, work, 2);
/* Disable split lock detection on this CPU to make progress */
sld_update_msr(false);
@@ -1266,31 +1451,13 @@ void handle_bus_lock(struct pt_regs *regs)
}
/*
- * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
- * only be trusted if it is confirmed that a CPU model implements a
- * specific feature at a particular bit position.
- *
- * The possible driver data field values:
- *
- * - 0: CPU models that are known to have the per-core split-lock detection
- * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
- *
- * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
- * bit 5 to enumerate the per-core split-lock detection feature.
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
*/
static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
{}
};
@@ -1302,24 +1469,27 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
+ /* Check for CPUs that have support but do not enumerate it: */
m = x86_match_cpu(split_lock_cpu_ids);
- if (!m)
- return;
+ if (m)
+ goto supported;
- switch (m->driver_data) {
- case 0:
- break;
- case 1:
- if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
- return;
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
- if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
- return;
- break;
- default:
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
return;
- }
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
cpu_model_supports_sld = true;
__split_lock_setup();
}
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index fbaf12e43f41..3b8476158236 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,7 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 7),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 1c87501e0fa3..0b971f974096 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -235,10 +235,10 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
* A list of the banks enabled on each logical CPU. Controls which respective
* descriptors to initialize later in mce_threshold_create_device().
*/
-static DEFINE_PER_CPU(unsigned int, bank_map);
+static DEFINE_PER_CPU(u64, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
-static DEFINE_PER_CPU(u32, smca_misc_banks_map);
+static DEFINE_PER_CPU(u64, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -267,7 +267,7 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
return;
if (low & MASK_BLKPTR_LO)
- per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
+ per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank);
}
@@ -306,6 +306,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu)
if ((low & BIT(5)) && !((high >> 5) & 0x3))
high |= BIT(5);
+ this_cpu_ptr(mce_banks_array)[bank].lsb_in_status = !!(low & BIT(8));
+
wrmsr(smca_config, low, high);
}
@@ -528,7 +530,7 @@ static u32 smca_get_block_address(unsigned int bank, unsigned int block,
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
- if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
+ if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank)))
return 0;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
@@ -572,7 +574,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int new;
if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
memset(&b, 0, sizeof(b));
b.cpu = cpu;
@@ -736,15 +738,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
if (m.status & MCI_STATUS_ADDRV) {
m.addr = addr;
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m.addr >> 56) & 0x3f;
-
- m.addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(&m);
}
if (mce_flags.smca) {
@@ -788,6 +782,24 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
return status & MCI_STATUS_DEFERRED;
}
+static bool _log_error_deferred(unsigned int bank, u32 misc)
+{
+ if (!_log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
+ mca_msr_reg(bank, MCA_ADDR), misc))
+ return false;
+
+ /*
+ * Non-SMCA systems don't have MCA_DESTAT/MCA_DEADDR registers.
+ * Return true here to avoid accessing these registers.
+ */
+ if (!mce_flags.smca)
+ return true;
+
+ /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+ return true;
+}
+
/*
* We have three scenarios for checking for Deferred errors:
*
@@ -799,20 +811,9 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc)
*/
static void log_error_deferred(unsigned int bank)
{
- bool defrd;
-
- defrd = _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS),
- mca_msr_reg(bank, MCA_ADDR), 0);
-
- if (!mce_flags.smca)
+ if (_log_error_deferred(bank, 0))
return;
- /* Clear MCA_DESTAT if we logged the deferred error from MCA_STATUS. */
- if (defrd) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
- return;
- }
-
/*
* Only deferred errors are logged in MCA_DE{STAT,ADDR} so just check
* for a valid error.
@@ -832,7 +833,7 @@ static void amd_deferred_error_interrupt(void)
static void log_error_thresholding(unsigned int bank, u64 misc)
{
- _log_error_bank(bank, mca_msr_reg(bank, MCA_STATUS), mca_msr_reg(bank, MCA_ADDR), misc);
+ _log_error_deferred(bank, misc);
}
static void log_and_reset_block(struct threshold_block *block)
@@ -877,7 +878,7 @@ static void amd_threshold_interrupt(void)
return;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
first_block = bp[bank]->blocks;
@@ -1028,7 +1029,7 @@ static const struct sysfs_ops threshold_ops = {
static void threshold_block_release(struct kobject *kobj);
-static struct kobj_type threshold_ktype = {
+static const struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_groups = default_groups,
.release = threshold_block_release,
@@ -1355,7 +1356,7 @@ int mce_threshold_create_device(unsigned int cpu)
return -ENOMEM;
for (bank = 0; bank < numbanks; ++bank) {
- if (!(this_cpu_read(bank_map) & (1 << bank)))
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
if (err) {
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2c8ec5c71712..2eec60f50057 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -67,13 +67,7 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);
-struct mce_bank {
- u64 ctl; /* subevents to enable */
-
- __u64 init : 1, /* initialise bank? */
- __reserved_1 : 63;
-};
-static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
#define ATTR_LEN 16
/* One object for each MCE bank, shared by all CPUs */
@@ -579,7 +573,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
mce->severity != MCE_DEFERRED_SEVERITY)
return NOTIFY_DONE;
- pfn = mce->addr >> PAGE_SHIFT;
+ pfn = (mce->addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
set_mce_nospec(pfn);
mce->kflags |= MCE_HANDLED_UC;
@@ -633,15 +627,7 @@ static noinstr void mce_read_aux(struct mce *m, int i)
m->addr <<= shift;
}
- /*
- * Extract [55:<lsb>] where lsb is the least significant
- * *valid* bit of the address bits.
- */
- if (mce_flags.smca) {
- u8 lsb = (m->addr >> 56) & 0x3f;
-
- m->addr &= GENMASK_ULL(55, lsb);
- }
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
@@ -1308,6 +1294,7 @@ static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
+ unsigned long pfn;
int ret;
p->mce_count = 0;
@@ -1316,9 +1303,10 @@ static void kill_me_maybe(struct callback_head *cb)
if (!p->mce_ripv)
flags |= MF_MUST_KILL;
- ret = memory_failure(p->mce_addr >> PAGE_SHIFT, flags);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ ret = memory_failure(pfn, flags);
if (!ret) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(pfn);
sync_core();
return;
}
@@ -1340,11 +1328,13 @@ static void kill_me_maybe(struct callback_head *cb)
static void kill_me_never(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+ unsigned long pfn;
p->mce_count = 0;
pr_err("Kernel accessed poison in user space at %llx\n", p->mce_addr);
- if (!memory_failure(p->mce_addr >> PAGE_SHIFT, 0))
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ pfn = (p->mce_addr & MCI_ADDR_PHYSADDR) >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
}
static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
@@ -2365,6 +2355,7 @@ static void mce_restart(void)
{
mce_timer_delete_all();
on_each_cpu(mce_cpu_restart, NULL, 1);
+ mce_schedule_work();
}
/* Toggle features for corrected errors */
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 100fbeebdc72..a05ac0716ecf 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -105,8 +105,7 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
{
char *p;
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
+ strscpy(mce_helper, buf, sizeof(mce_helper));
p = strchr(mce_helper, '\n');
if (p)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 7e03f5b7f6bd..d2412ce2d312 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -177,6 +177,24 @@ struct mce_vendor_flags {
extern struct mce_vendor_flags mce_flags;
+struct mce_bank {
+ /* subevents to enable */
+ u64 ctl;
+
+ /* initialise bank? */
+ __u64 init : 1,
+
+ /*
+ * (AMD) MCA_CONFIG[McaLsbInStatusSupported]: When set, this bit indicates
+ * the LSB field is found in MCA_STATUS and not in MCA_ADDR.
+ */
+ lsb_in_status : 1,
+
+ __reserved_1 : 62;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);
+
enum mca_msr {
MCA_CTL,
MCA_STATUS,
@@ -189,8 +207,34 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
+
+/*
+ * If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
+ * [56:0] of MCA_STATUS, else in bits [55:0] of MCA_ADDR.
+ */
+static __always_inline void smca_extract_err_addr(struct mce *m)
+{
+ u8 lsb;
+
+ if (!mce_flags.smca)
+ return;
+
+ if (this_cpu_ptr(mce_banks_array)[m->bank].lsb_in_status) {
+ lsb = (m->status >> 24) & 0x3f;
+
+ m->addr &= GENMASK_ULL(56, lsb);
+
+ return;
+ }
+
+ lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+}
+
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
+static inline void smca_extract_err_addr(struct mce *m) { }
#endif
#ifdef CONFIG_X86_ANCIENT_MCE
@@ -200,11 +244,11 @@ noinstr void pentium_machine_check(struct pt_regs *regs);
noinstr void winchip_machine_check(struct pt_regs *regs);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
-static inline void pentium_machine_check(struct pt_regs *regs) {}
-static inline void winchip_machine_check(struct pt_regs *regs) {}
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
noinstr u64 mce_rdmsrl(u32 msr);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 00483d1c27e4..c4477162c07d 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -203,6 +203,11 @@ static struct severity {
BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
),
MCESEV(
+ PANIC, "Uncorrected in kernel",
+ BITSET(MCI_STATUS_UC),
+ KERNEL
+ ),
+ MCESEV(
UC, "Uncorrected",
BITSET(MCI_STATUS_UC)
),
@@ -391,9 +396,6 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
*msg = s->msg;
s->covered = 1;
- if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL)
- return MCE_PANIC_SEVERITY;
-
return s->sev;
}
}
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3a35dec3ec55..f5fdeb1e3606 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -55,11 +55,13 @@ struct cont_desc {
};
static u32 ucode_new_rev;
-static u8 amd_ucode_patch[PATCH_MAX_SIZE];
+
+/* One blob per node. */
+static u8 amd_ucode_patch[MAX_NUMNODES][PATCH_MAX_SIZE];
/*
* Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/microcode.rst
+ * format. See Documentation/arch/x86/microcode.rst
*/
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
@@ -330,8 +332,9 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size, true);
if (ret < 0) {
/*
- * Patch verification failed, skip to the next
- * container, if there's one:
+ * Patch verification failed, skip to the next container, if
+ * there is one. Before exit, check whether that container has
+ * found a patch already. If so, use it.
*/
goto out;
} else if (ret > 0) {
@@ -350,6 +353,7 @@ skip:
size -= patch_size + SECTION_HDR_SIZE;
}
+out:
/*
* If we have found a patch (desc->mc), it means we're looking at the
* container which has a patch for this CPU so return 0 to mean, @ucode
@@ -364,7 +368,6 @@ skip:
return 0;
}
-out:
return orig_size - size;
}
@@ -414,8 +417,7 @@ static int __apply_microcode_amd(struct microcode_amd *mc)
*
* Returns true if container found (sets @desc), false otherwise.
*/
-static bool
-apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
+static bool early_apply_microcode(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
{
struct cont_desc desc = { 0 };
u8 (*patch)[PATCH_MAX_SIZE];
@@ -428,7 +430,7 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
- patch = &amd_ucode_patch;
+ patch = &amd_ucode_patch[0];
#endif
desc.cpuid_1_eax = cpuid_1_eax;
@@ -481,7 +483,7 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static void find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
struct ucode_cpu_info *uci;
struct cpio_data cp;
@@ -511,11 +513,11 @@ void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
{
struct cpio_data cp = { };
- __load_ucode_amd(cpuid_1_eax, &cp);
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
+ early_apply_microcode(cpuid_1_eax, cp.data, cp.size, true);
}
void load_ucode_amd_ap(unsigned int cpuid_1_eax)
@@ -546,15 +548,14 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
}
- __load_ucode_amd(cpuid_1_eax, &cp);
+ find_blobs_in_containers(cpuid_1_eax, &cp);
if (!(cp.data && cp.size))
return;
- apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
+ early_apply_microcode(cpuid_1_eax, cp.data, cp.size, false);
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -572,19 +573,19 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
+ ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret > UCODE_UPDATED)
return -EINVAL;
return 0;
}
-void reload_ucode_amd(void)
+void reload_ucode_amd(unsigned int cpu)
{
- struct microcode_amd *mc;
u32 rev, dummy __always_unused;
+ struct microcode_amd *mc;
- mc = (struct microcode_amd *)amd_ucode_patch;
+ mc = (struct microcode_amd *)amd_ucode_patch[cpu_to_node(cpu)];
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
@@ -816,6 +817,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
return 0;
}
+/* Scan the blob in @data and add microcode patches to the cache. */
static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
size_t size)
{
@@ -850,9 +852,10 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state
-load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
struct ucode_patch *p;
enum ucode_state ret;
@@ -865,22 +868,22 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
return ret;
}
- p = find_patch(0);
- if (!p) {
- return ret;
- } else {
- if (boot_cpu_data.microcode >= p->patch_id)
- return ret;
+ for_each_node(nid) {
+ cpu = cpumask_first(cpumask_of_node(nid));
+ c = &cpu_data(cpu);
- ret = UCODE_NEW;
- }
+ p = find_patch(cpu);
+ if (!p)
+ continue;
- /* save BSP's matching patch for early load */
- if (!save)
- return ret;
+ if (c->microcode >= p->patch_id)
+ continue;
- memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
- memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
+ ret = UCODE_NEW;
+
+ memset(&amd_ucode_patch[nid], 0, PATCH_MAX_SIZE);
+ memcpy(&amd_ucode_patch[nid], p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
+ }
return ret;
}
@@ -901,19 +904,13 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
*
* These might be larger than 2K.
*/
-static enum ucode_state request_microcode_amd(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
- bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
- /* reload ucode container only on the boot cpu */
- if (!refresh_fw || !bsp)
- return UCODE_OK;
-
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -926,7 +923,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
if (!verify_container(fw->data, fw->size, false))
goto fw_release;
- ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6a41cee242f6..3afcf3de0dd4 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -298,7 +298,7 @@ struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
#endif
}
-void reload_early_microcode(void)
+void reload_early_microcode(unsigned int cpu)
{
int vendor, family;
@@ -312,67 +312,13 @@ void reload_early_microcode(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- reload_ucode_amd();
+ reload_ucode_amd(cpu);
break;
default:
break;
}
}
-static void collect_cpu_info_local(void *arg)
-{
- struct cpu_info_ctx *ctx = arg;
-
- ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
- ctx->cpu_sig);
-}
-
-static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
-{
- struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
- int ret;
-
- ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
- if (!ret)
- ret = ctx.err;
-
- return ret;
-}
-
-static int collect_cpu_info(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- int ret;
-
- memset(uci, 0, sizeof(*uci));
-
- ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
- if (!ret)
- uci->valid = 1;
-
- return ret;
-}
-
-static void apply_microcode_local(void *arg)
-{
- enum ucode_state *err = arg;
-
- *err = microcode_ops->apply_microcode(smp_processor_id());
-}
-
-static int apply_microcode_on_target(int cpu)
-{
- enum ucode_state err;
- int ret;
-
- ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1);
- if (!ret) {
- if (err == UCODE_ERROR)
- ret = 1;
- }
- return ret;
-}
-
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
@@ -458,15 +404,15 @@ static int __reload_late(void *info)
* below.
*/
if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
- apply_microcode_local(&err);
+ err = microcode_ops->apply_microcode(cpu);
else
goto wait_for_siblings;
if (err >= UCODE_NFOUND) {
- if (err == UCODE_ERROR)
+ if (err == UCODE_ERROR) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
-
- ret = -1;
+ ret = -1;
+ }
}
wait_for_siblings:
@@ -480,7 +426,7 @@ wait_for_siblings:
* revision.
*/
if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu)
- apply_microcode_local(&err);
+ err = microcode_ops->apply_microcode(cpu);
return ret;
}
@@ -492,6 +438,7 @@ wait_for_siblings:
static int microcode_reload_late(void)
{
int old = boot_cpu_data.microcode, ret;
+ struct cpuinfo_x86 prev_info;
pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
pr_err("You should switch to early loading, if possible.\n");
@@ -499,12 +446,21 @@ static int microcode_reload_late(void)
atomic_set(&late_cpus_in, 0);
atomic_set(&late_cpus_out, 0);
- ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
- if (ret == 0)
- microcode_check();
+ /*
+ * Take a snapshot before the microcode update in order to compare and
+ * check whether any bits changed after an update.
+ */
+ store_cpu_caps(&prev_info);
- pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n",
- old, boot_cpu_data.microcode);
+ ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+ if (!ret) {
+ pr_info("Reload succeeded, microcode revision: 0x%x -> 0x%x\n",
+ old, boot_cpu_data.microcode);
+ microcode_check(&prev_info);
+ } else {
+ pr_info("Reload failed, current microcode revision: 0x%x\n",
+ boot_cpu_data.microcode);
+ }
return ret;
}
@@ -519,11 +475,8 @@ static ssize_t reload_store(struct device *dev,
ssize_t ret = 0;
ret = kstrtoul(buf, 0, &val);
- if (ret)
- return ret;
-
- if (val != 1)
- return size;
+ if (ret || val != 1)
+ return -EINVAL;
cpus_read_lock();
@@ -531,7 +484,7 @@ static ssize_t reload_store(struct device *dev,
if (ret)
goto put;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev);
if (tmp_ret != UCODE_NEW)
goto put;
@@ -561,7 +514,7 @@ static ssize_t version_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct device *dev,
+static ssize_t processor_flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
@@ -569,8 +522,8 @@ static ssize_t pf_show(struct device *dev,
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static DEVICE_ATTR(version, 0444, version_show, NULL);
-static DEVICE_ATTR(processor_flags, 0444, pf_show, NULL);
+static DEVICE_ATTR_RO(version);
+static DEVICE_ATTR_RO(processor_flags);
static struct attribute *mc_default_attrs[] = {
&dev_attr_version.attr,
@@ -589,91 +542,17 @@ static void microcode_fini_cpu(int cpu)
microcode_ops->microcode_fini_cpu(cpu);
}
-static enum ucode_state microcode_resume_cpu(int cpu)
-{
- if (apply_microcode_on_target(cpu))
- return UCODE_ERROR;
-
- pr_debug("CPU%d updated upon resume\n", cpu);
-
- return UCODE_OK;
-}
-
-static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
-{
- enum ucode_state ustate;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (uci->valid)
- return UCODE_OK;
-
- if (collect_cpu_info(cpu))
- return UCODE_ERROR;
-
- /* --dimm. Trigger a delayed update? */
- if (system_state != SYSTEM_RUNNING)
- return UCODE_NFOUND;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw);
- if (ustate == UCODE_NEW) {
- pr_debug("CPU%d updated upon init\n", cpu);
- apply_microcode_on_target(cpu);
- }
-
- return ustate;
-}
-
-static enum ucode_state microcode_update_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- /* Refresh CPU microcode revision after resume. */
- collect_cpu_info(cpu);
-
- if (uci->valid)
- return microcode_resume_cpu(cpu);
-
- return microcode_init_cpu(cpu, false);
-}
-
-static int mc_device_add(struct device *dev, struct subsys_interface *sif)
-{
- int err, cpu = dev->id;
-
- if (!cpu_online(cpu))
- return 0;
-
- pr_debug("CPU%d added\n", cpu);
-
- err = sysfs_create_group(&dev->kobj, &mc_attr_group);
- if (err)
- return err;
-
- if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
- return -EINVAL;
-
- return err;
-}
+ memset(uci, 0, sizeof(*uci));
-static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
-{
- int cpu = dev->id;
+ microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
- if (!cpu_online(cpu))
- return;
-
- pr_debug("CPU%d removed\n", cpu);
- microcode_fini_cpu(cpu);
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ return microcode_ops->apply_microcode(cpu);
}
-static struct subsys_interface mc_cpu_interface = {
- .name = "microcode",
- .subsys = &cpu_subsys,
- .add_dev = mc_device_add,
- .remove_dev = mc_device_remove,
-};
-
/**
* microcode_bsp_resume - Update boot CPU microcode during resume.
*/
@@ -682,21 +561,23 @@ void microcode_bsp_resume(void)
int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- if (uci->valid && uci->mc)
+ if (uci->mc)
microcode_ops->apply_microcode(cpu);
- else if (!uci->mc)
- reload_early_microcode();
+ else
+ reload_early_microcode(cpu);
}
static struct syscore_ops mc_syscore_ops = {
- .resume = microcode_bsp_resume,
+ .resume = microcode_bsp_resume,
};
static int mc_cpu_starting(unsigned int cpu)
{
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- return 0;
+ enum ucode_state err = microcode_ops->apply_microcode(cpu);
+
+ pr_debug("%s: CPU%d, err: %d\n", __func__, cpu, err);
+
+ return err == UCODE_ERROR;
}
static int mc_cpu_online(unsigned int cpu)
@@ -713,13 +594,30 @@ static int mc_cpu_down_prep(unsigned int cpu)
struct device *dev;
dev = get_cpu_device(cpu);
+
+ microcode_fini_cpu(cpu);
+
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
+ pr_debug("%s: CPU%d\n", __func__, cpu);
return 0;
}
+static void setup_online_cpu(struct work_struct *work)
+{
+ int cpu = smp_processor_id();
+ enum ucode_state err;
+
+ err = microcode_init_cpu(cpu);
+ if (err == UCODE_ERROR) {
+ pr_err("Error applying microcode on CPU%d\n", cpu);
+ return;
+ }
+
+ mc_cpu_online(cpu);
+}
+
static struct attribute *cpu_root_microcode_attrs[] = {
#ifdef CONFIG_MICROCODE_LATE_LOADING
&dev_attr_reload.attr,
@@ -734,6 +632,7 @@ static const struct attribute_group cpu_root_microcode_group = {
static int __init microcode_init(void)
{
+ struct device *dev_root;
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;
@@ -750,28 +649,23 @@ static int __init microcode_init(void)
if (!microcode_ops)
return -ENODEV;
- microcode_pdev = platform_device_register_simple("microcode", -1,
- NULL, 0);
+ microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0);
if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
- error = subsys_interface_register(&mc_cpu_interface);
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
-
- if (error)
- goto out_pdev;
-
- error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpu_root_microcode_group);
-
- if (error) {
- pr_err("Error creating microcode group!\n");
- goto out_driver;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ error = sysfs_create_group(&dev_root->kobj, &cpu_root_microcode_group);
+ put_device(dev_root);
+ if (error) {
+ pr_err("Error creating microcode group!\n");
+ goto out_pdev;
+ }
}
+ /* Do per-CPU setup */
+ schedule_on_each_cpu(setup_online_cpu);
+
register_syscore_ops(&mc_syscore_ops);
cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting",
mc_cpu_starting, NULL);
@@ -782,15 +676,6 @@ static int __init microcode_init(void)
return 0;
- out_driver:
- cpus_read_lock();
- mutex_lock(&microcode_mutex);
-
- subsys_interface_unregister(&mc_cpu_interface);
-
- mutex_unlock(&microcode_mutex);
- cpus_read_unlock();
-
out_pdev:
platform_device_unregister(microcode_pdev);
return error;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 1fcbd671f1df..467cf37ea90a 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -48,34 +48,6 @@ static int llc_size_per_core;
/*
* Returns 1 if update has been found, 0 otherwise.
*/
-static int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
struct microcode_header_intel *mc_hdr = mc;
@@ -83,7 +55,7 @@ static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev
if (mc_hdr->rev <= new_rev)
return 0;
- return find_matching_signature(mc, csig, cpf);
+ return intel_find_matching_signature(mc, csig, cpf);
}
static struct ucode_patch *memdup_patch(void *data, unsigned int size)
@@ -117,7 +89,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
sig = mc_saved_hdr->sig;
pf = mc_saved_hdr->pf;
- if (find_matching_signature(data, sig, pf)) {
+ if (intel_find_matching_signature(data, sig, pf)) {
prev_found = true;
if (mc_hdr->rev <= mc_saved_hdr->rev)
@@ -149,7 +121,7 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
if (!p)
return;
- if (!find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
+ if (!intel_find_matching_signature(p->data, uci->cpu_sig.sig, uci->cpu_sig.pf))
return;
/*
@@ -163,104 +135,6 @@ static void save_microcode_patch(struct ucode_cpu_info *uci, void *data, unsigne
intel_ucode_patch = p->data;
}
-static int microcode_sanity_check(void *mc, int print_err)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
@@ -281,13 +155,13 @@ scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
mc_size = get_totalsize(mc_header);
if (!mc_size ||
mc_size > size ||
- microcode_sanity_check(data, 0) < 0)
+ intel_microcode_sanity_check(data, false, MC_HEADER_TYPE_MICROCODE) < 0)
break;
size -= mc_size;
- if (!find_matching_signature(data, uci->cpu_sig.sig,
- uci->cpu_sig.pf)) {
+ if (!intel_find_matching_signature(data, uci->cpu_sig.sig,
+ uci->cpu_sig.pf)) {
data += mc_size;
continue;
}
@@ -431,14 +305,11 @@ static bool load_builtin_intel_microcode(struct cpio_data *cp)
return false;
}
-/*
- * Print ucode update info.
- */
-static void
-print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
+static void print_ucode_info(int old_rev, int new_rev, unsigned int date)
{
- pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
- uci->cpu_sig.rev,
+ pr_info_once("updated early: 0x%x -> 0x%x, date = %04x-%02x-%02x\n",
+ old_rev,
+ new_rev,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
@@ -448,6 +319,7 @@ print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
static int delay_ucode_info;
static int current_mc_date;
+static int early_old_rev;
/*
* Print early updated ucode info after printk works. This is delayed info dump.
@@ -458,7 +330,7 @@ void show_ucode_info_early(void)
if (delay_ucode_info) {
intel_cpu_collect_info(&uci);
- print_ucode_info(&uci, current_mc_date);
+ print_ucode_info(early_old_rev, uci.cpu_sig.rev, current_mc_date);
delay_ucode_info = 0;
}
}
@@ -467,40 +339,32 @@ void show_ucode_info_early(void)
* At this point, we can not call printk() yet. Delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
-static void print_ucode(struct ucode_cpu_info *uci)
+static void print_ucode(int old_rev, int new_rev, int date)
{
- struct microcode_intel *mc;
int *delay_ucode_info_p;
int *current_mc_date_p;
-
- mc = uci->mc;
- if (!mc)
- return;
+ int *early_old_rev_p;
delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info);
current_mc_date_p = (int *)__pa_nodebug(&current_mc_date);
+ early_old_rev_p = (int *)__pa_nodebug(&early_old_rev);
*delay_ucode_info_p = 1;
- *current_mc_date_p = mc->hdr.date;
+ *current_mc_date_p = date;
+ *early_old_rev_p = old_rev;
}
#else
-static inline void print_ucode(struct ucode_cpu_info *uci)
+static inline void print_ucode(int old_rev, int new_rev, int date)
{
- struct microcode_intel *mc;
-
- mc = uci->mc;
- if (!mc)
- return;
-
- print_ucode_info(uci, mc->hdr.date);
+ print_ucode_info(old_rev, new_rev, date);
}
#endif
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc;
- u32 rev;
+ u32 rev, old_rev;
mc = uci->mc;
if (!mc)
@@ -517,6 +381,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
return UCODE_OK;
}
+ old_rev = rev;
+
/*
* Writeback and invalidate caches before updating microcode to avoid
* internal issues depending on what the microcode is updating.
@@ -533,9 +399,9 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
uci->cpu_sig.rev = rev;
if (early)
- print_ucode(uci);
+ print_ucode(old_rev, uci->cpu_sig.rev, mc->hdr.date);
else
- print_ucode_info(uci, mc->hdr.date);
+ print_ucode_info(old_rev, uci->cpu_sig.rev, mc->hdr.date);
return 0;
}
@@ -621,7 +487,6 @@ void load_ucode_intel_ap(void)
else
iup = &intel_ucode_patch;
-reget:
if (!*iup) {
patch = __load_ucode_intel(&uci);
if (!patch)
@@ -632,12 +497,7 @@ reget:
uci.mc = *iup;
- if (apply_microcode_early(&uci, true)) {
- /* Mixed-silicon system? Try to refetch the proper patch: */
- *iup = NULL;
-
- goto reget;
- }
+ apply_microcode_early(&uci, true);
}
static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
@@ -652,9 +512,9 @@ static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
if (phdr->rev <= uci->cpu_sig.rev)
continue;
- if (!find_matching_signature(phdr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf))
+ if (!intel_find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
continue;
return iter->data;
@@ -680,7 +540,6 @@ void reload_ucode_intel(void)
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
- static struct cpu_signature prev;
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
unsigned int val[2];
@@ -696,13 +555,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->rev = c->microcode;
- /* No extra locking on prev, races are harmless. */
- if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) {
- pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n",
- csig->sig, csig->pf, csig->rev);
- prev = *csig;
- }
-
return 0;
}
@@ -820,7 +672,7 @@ static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter)
memcpy(mc, &mc_header, sizeof(mc_header));
data = mc + sizeof(mc_header);
if (!copy_from_iter_full(data, data_size, iter) ||
- microcode_sanity_check(mc, 1) < 0) {
+ intel_microcode_sanity_check(mc, true, MC_HEADER_TYPE_MICROCODE) < 0) {
break;
}
@@ -885,8 +737,7 @@ static bool is_blacklisted(unsigned int cpu)
return false;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device,
- bool refresh_fw)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
@@ -908,7 +759,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
kvec.iov_base = (void *)firmware->data;
kvec.iov_len = firmware->size;
- iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size);
+ iov_iter_kvec(&iter, ITER_SOURCE, &kvec, 1, firmware->size);
ret = generic_load_microcode(cpu, &iter);
release_firmware(firmware);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 831613959a92..c7969e806c64 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,7 +18,6 @@
#include <linux/kexec.h>
#include <linux/i8253.h>
#include <linux/random.h>
-#include <linux/swiotlb.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv-tlfs.h>
@@ -33,13 +32,79 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
-#include <asm/coco.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
+/* Is Linux running on nested Microsoft Hypervisor */
+bool hv_nested;
struct ms_hyperv_info ms_hyperv;
#if IS_ENABLED(CONFIG_HYPERV)
+static inline unsigned int hv_get_nested_reg(unsigned int reg)
+{
+ if (hv_is_sint_reg(reg))
+ return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0;
+
+ switch (reg) {
+ case HV_REGISTER_SIMP:
+ return HV_REGISTER_NESTED_SIMP;
+ case HV_REGISTER_SIEFP:
+ return HV_REGISTER_NESTED_SIEFP;
+ case HV_REGISTER_SVERSION:
+ return HV_REGISTER_NESTED_SVERSION;
+ case HV_REGISTER_SCONTROL:
+ return HV_REGISTER_NESTED_SCONTROL;
+ case HV_REGISTER_EOM:
+ return HV_REGISTER_NESTED_EOM;
+ default:
+ return reg;
+ }
+}
+
+u64 hv_get_non_nested_register(unsigned int reg)
+{
+ u64 value;
+
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
+ hv_ghcb_msr_read(reg, &value);
+ else
+ rdmsrl(reg, value);
+ return value;
+}
+EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
+
+void hv_set_non_nested_register(unsigned int reg, u64 value)
+{
+ if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
+ hv_ghcb_msr_write(reg, value);
+
+ /* Write proxy bit via wrmsl instruction */
+ if (hv_is_sint_reg(reg))
+ wrmsrl(reg, value | 1 << 20);
+ } else {
+ wrmsrl(reg, value);
+ }
+}
+EXPORT_SYMBOL_GPL(hv_set_non_nested_register);
+
+u64 hv_get_register(unsigned int reg)
+{
+ if (hv_nested)
+ reg = hv_get_nested_reg(reg);
+
+ return hv_get_non_nested_register(reg);
+}
+EXPORT_SYMBOL_GPL(hv_get_register);
+
+void hv_set_register(unsigned int reg, u64 value)
+{
+ if (hv_nested)
+ reg = hv_get_nested_reg(reg);
+
+ hv_set_non_nested_register(reg, value);
+}
+EXPORT_SYMBOL_GPL(hv_set_register);
+
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
@@ -183,11 +248,6 @@ static uint32_t __init ms_hyperv_platform(void)
return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
}
-static unsigned char hv_get_nmi_reason(void)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
@@ -291,16 +351,25 @@ static void __init ms_hyperv_init_platform(void)
* To mirror what Windows does we should extract CPU management
* features and use the ReservedIdentityBit to detect if Linux is the
* root partition. But that requires negotiating CPU management
- * interface (a process to be finalized).
+ * interface (a process to be finalized). For now, use the privilege
+ * flag as the indicator for running as root.
*
- * For now, use the privilege flag as the indicator for running as
- * root.
+ * Hyper-V should never specify running as root and as a Confidential
+ * VM. But to protect against a compromised/malicious Hyper-V trying
+ * to exploit root behavior to expose Confidential VM memory, ignore
+ * the root partition setting if also a Confidential VM.
*/
- if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
+ if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) &&
+ !(ms_hyperv.priv_high & HV_ISOLATION)) {
hv_root_partition = true;
pr_info("Hyper-V: running as root partition\n");
}
+ if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
+ hv_nested = true;
+ pr_info("Hyper-V: running on a nested hypervisor\n");
+ }
+
/*
* Extract host information.
*/
@@ -325,23 +394,16 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
- ms_hyperv.shared_gpa_boundary =
- BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
static_branch_enable(&isolation_type_snp);
-#ifdef CONFIG_SWIOTLB
- swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
-#endif
- }
- /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
- cc_set_vendor(CC_VENDOR_HYPERV);
- }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -388,7 +450,7 @@ static void __init ms_hyperv_init_platform(void)
* setting of this MSR bit should happen before init_intel()
* is called.
*/
- wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
@@ -410,6 +472,9 @@ static void __init ms_hyperv_init_platform(void)
i8253_clear_counter_on_shutdown = false;
#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
*/
@@ -449,6 +514,7 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
+ hv_vtl_init_platform();
#endif
/*
* TSC should be marked as unstable only after Hyper-V
@@ -475,6 +541,12 @@ static bool __init ms_hyperv_x2apic_available(void)
* (logically) generates MSIs directly to the system APIC irq domain.
* There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
* pci-hyperv host bridge.
+ *
+ * Note: for a Hyper-V root partition, this will always return false.
+ * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
+ * default, they are implemented as intercepts by the Windows Hyper-V stack.
+ * Even a nested root partition (L2 root) will not get them because the
+ * nested (L1) hypervisor filters them out.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index a65a0272096d..eff6ac62c0ff 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -109,7 +109,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
return 0;
}
-static const struct mtrr_ops amd_mtrr_ops = {
+const struct mtrr_ops amd_mtrr_ops = {
.vendor = X86_VENDOR_AMD,
.set = amd_set_mtrr,
.get = amd_get_mtrr,
@@ -117,9 +117,3 @@ static const struct mtrr_ops amd_mtrr_ops = {
.validate_add_page = amd_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init amd_init_mtrr(void)
-{
- set_mtrr_ops(&amd_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index f27177816569..b8a74eddde83 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -111,7 +111,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
return 0;
}
-static const struct mtrr_ops centaur_mtrr_ops = {
+const struct mtrr_ops centaur_mtrr_ops = {
.vendor = X86_VENDOR_CENTAUR,
.set = centaur_set_mcr,
.get = centaur_get_mcr,
@@ -119,9 +119,3 @@ static const struct mtrr_ops centaur_mtrr_ops = {
.validate_add_page = centaur_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init centaur_init_mtrr(void)
-{
- set_mtrr_ops(&centaur_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index ca670919b561..173b9e01e623 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -234,51 +234,11 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
post_set();
}
-typedef struct {
- unsigned long base;
- unsigned long size;
- mtrr_type type;
-} arr_state_t;
-
-static arr_state_t arr_state[8] = {
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL},
- {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}
-};
-
-static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 };
-
-static void cyrix_set_all(void)
-{
- int i;
-
- prepare_set();
-
- /* the CCRs are not contiguous */
- for (i = 0; i < 4; i++)
- setCx86(CX86_CCR0 + i, ccr_state[i]);
- for (; i < 7; i++)
- setCx86(CX86_CCR4 + i, ccr_state[i]);
-
- for (i = 0; i < 8; i++) {
- cyrix_set_arr(i, arr_state[i].base,
- arr_state[i].size, arr_state[i].type);
- }
-
- post_set();
-}
-
-static const struct mtrr_ops cyrix_mtrr_ops = {
+const struct mtrr_ops cyrix_mtrr_ops = {
.vendor = X86_VENDOR_CYRIX,
- .set_all = cyrix_set_all,
.set = cyrix_set_arr,
.get = cyrix_get_arr,
.get_free_region = cyrix_get_free_region,
.validate_add_page = generic_validate_add_page,
.have_wrcomb = positive_have_wrcomb,
};
-
-int __init cyrix_init_mtrr(void)
-{
- set_mtrr_ops(&cyrix_mtrr_ops);
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 558108296f3c..ee09d359e08f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <asm/processor-flags.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
@@ -396,9 +397,6 @@ print_fixed(unsigned base, unsigned step, const mtrr_type *types)
}
}
-static void prepare_set(void);
-static void post_set(void);
-
static void __init print_mtrr_state(void)
{
unsigned int i;
@@ -444,20 +442,6 @@ static void __init print_mtrr_state(void)
pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20);
}
-/* PAT setup for BP. We need to go through sync steps here */
-void __init mtrr_bp_pat_init(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
-
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-}
-
/* Grab all of the MTRR state for this CPU into *state */
bool __init get_mtrr_state(void)
{
@@ -684,7 +668,10 @@ static u32 deftype_lo, deftype_hi;
/**
* set_mtrr_state - Set the MTRR state for this CPU.
*
- * NOTE: The CPU must already be in a safe state for MTRR changes.
+ * NOTE: The CPU must already be in a safe state for MTRR changes, including
+ * measures that only a single CPU can be active in set_mtrr_state() in
+ * order to not be subject to races for usage of deftype_lo. This is
+ * accomplished by taking cache_disable_lock.
* RETURNS: 0 if no changes made, else a mask indicating what was changed.
*/
static unsigned long set_mtrr_state(void)
@@ -715,106 +702,34 @@ static unsigned long set_mtrr_state(void)
return change_mask;
}
-
-static unsigned long cr4;
-static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
-
-/*
- * Since we are disabling the cache don't allow any interrupts,
- * they would run extremely slow and would only increase the pain.
- *
- * The caller must ensure that local interrupts are disabled and
- * are reenabled after post_set() has been called.
- */
-static void prepare_set(void) __acquires(set_atomicity_lock)
+void mtrr_disable(void)
{
- unsigned long cr0;
-
- /*
- * Note that this is not ideal
- * since the cache is only flushed/disabled for this CPU while the
- * MTRRs are changed, but changing this requires more invasive
- * changes to the way the kernel boots
- */
-
- raw_spin_lock(&set_atomicity_lock);
-
- /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
- cr0 = read_cr0() | X86_CR0_CD;
- write_cr0(cr0);
-
- /*
- * Cache flushing is the most time-consuming step when programming
- * the MTRRs. Fortunately, as per the Intel Software Development
- * Manual, we can skip it if the processor supports cache self-
- * snooping.
- */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
-
- /* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (boot_cpu_has(X86_FEATURE_PGE)) {
- cr4 = __read_cr4();
- __write_cr4(cr4 & ~X86_CR4_PGE);
- }
-
- /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
-
- /* Again, only flush caches if we have to. */
- if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
- wbinvd();
}
-static void post_set(void) __releases(set_atomicity_lock)
+void mtrr_enable(void)
{
- /* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- flush_tlb_local();
-
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
-
- /* Enable caches */
- write_cr0(read_cr0() & ~X86_CR0_CD);
-
- /* Restore value of CR4 */
- if (boot_cpu_has(X86_FEATURE_PGE))
- __write_cr4(cr4);
- raw_spin_unlock(&set_atomicity_lock);
}
-static void generic_set_all(void)
+void mtrr_generic_set_state(void)
{
unsigned long mask, count;
- unsigned long flags;
-
- local_irq_save(flags);
- prepare_set();
/* Actually set the state */
mask = set_mtrr_state();
- /* also set PAT */
- pat_init();
-
- post_set();
- local_irq_restore(flags);
-
/* Use the atomic bitops to update the global mask */
for (count = 0; count < sizeof(mask) * 8; ++count) {
if (mask & 0x01)
set_bit(count, &smp_changes_mask);
mask >>= 1;
}
-
}
/**
@@ -836,7 +751,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
vr = &mtrr_state.var_ranges[reg];
local_irq_save(flags);
- prepare_set();
+ cache_disable();
if (size == 0) {
/*
@@ -855,7 +770,7 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi);
}
- post_set();
+ cache_enable();
local_irq_restore(flags);
}
@@ -914,8 +829,6 @@ int positive_have_wrcomb(void)
* Generic structure...
*/
const struct mtrr_ops generic_mtrr_ops = {
- .use_intel_if = 1,
- .set_all = generic_set_all,
.get = generic_get_mtrr,
.get_free_region = generic_get_free_region,
.set = generic_set_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 2746cac9d8a9..783f3210d582 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -46,6 +46,7 @@
#include <linux/syscore_ops.h>
#include <linux/rcupdate.h>
+#include <asm/cacheinfo.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
@@ -58,32 +59,18 @@
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
-static bool __mtrr_enabled;
-
static bool mtrr_enabled(void)
{
- return __mtrr_enabled;
+ return !!mtrr_if;
}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
-static bool mtrr_aps_delayed_init;
-
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
const struct mtrr_ops *mtrr_if;
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
-
-void __init set_mtrr_ops(const struct mtrr_ops *ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
-}
-
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb(void)
{
@@ -119,11 +106,11 @@ static int have_wrcomb(void)
}
/* This function returns the number of variable MTRRs */
-static void __init set_num_var_ranges(void)
+static void __init set_num_var_ranges(bool use_generic)
{
unsigned long config = 0, dummy;
- if (use_intel())
+ if (use_generic)
rdmsr(MSR_MTRRcap, config, dummy);
else if (is_cpu(AMD) || is_cpu(HYGON))
config = 2;
@@ -160,25 +147,8 @@ static int mtrr_rendezvous_handler(void *info)
{
struct set_mtrr_data *data = info;
- /*
- * We use this same function to initialize the mtrrs during boot,
- * resume, runtime cpu online and on an explicit request to set a
- * specific MTRR.
- *
- * During boot or suspend, the state of the boot cpu's mtrrs has been
- * saved, and we want to replicate that across all the cpus that come
- * online (either at the end of boot or resume or during a runtime cpu
- * online). If we're doing that, @reg is set to something special and on
- * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
- * started the boot/resume sequence, this might be a duplicate
- * set_all()).
- */
- if (data->smp_reg != ~0U) {
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
- mtrr_if->set_all();
- }
+ mtrr_if->set(data->smp_reg, data->smp_base,
+ data->smp_size, data->smp_type);
return 0;
}
@@ -248,19 +218,6 @@ static void set_mtrr_cpuslocked(unsigned int reg, unsigned long base,
stop_machine_cpuslocked(mtrr_rendezvous_handler, &data, cpu_online_mask);
}
-static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data = { .smp_reg = reg,
- .smp_base = base,
- .smp_size = size,
- .smp_type = type
- };
-
- stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
- cpu_callout_mask);
-}
-
/**
* mtrr_add_page - Add a memory type region
* @base: Physical base address of region in pages (in units of 4 kB!)
@@ -617,20 +574,6 @@ int arch_phys_wc_index(int handle)
}
EXPORT_SYMBOL_GPL(arch_phys_wc_index);
-/*
- * HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-static void __init init_ifs(void)
-{
-#ifndef CONFIG_X86_64
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-#endif
-}
-
/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
* MTRR driver doesn't require this
*/
@@ -686,10 +629,9 @@ int __initdata changed_by_mtrr_cleanup;
*/
void __init mtrr_bp_init(void)
{
+ const char *why = "(not available)";
u32 phys_addr;
- init_ifs();
-
phys_addr = 32;
if (boot_cpu_has(X86_FEATURE_MTRR)) {
@@ -730,21 +672,21 @@ void __init mtrr_bp_init(void)
case X86_VENDOR_AMD:
if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
+ mtrr_if = &amd_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CENTAUR:
if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
+ mtrr_if = &centaur_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
+ mtrr_if = &cyrix_mtrr_ops;
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
@@ -754,58 +696,23 @@ void __init mtrr_bp_init(void)
}
}
- if (mtrr_if) {
- __mtrr_enabled = true;
- set_num_var_ranges();
+ if (mtrr_enabled()) {
+ set_num_var_ranges(mtrr_if == &generic_mtrr_ops);
init_table();
- if (use_intel()) {
+ if (mtrr_if == &generic_mtrr_ops) {
/* BIOS may override */
- __mtrr_enabled = get_mtrr_state();
-
- if (mtrr_enabled())
- mtrr_bp_pat_init();
-
- if (mtrr_cleanup(phys_addr)) {
- changed_by_mtrr_cleanup = 1;
- mtrr_if->set_all();
+ if (get_mtrr_state()) {
+ memory_caching_control |= CACHE_MTRR;
+ changed_by_mtrr_cleanup = mtrr_cleanup(phys_addr);
+ } else {
+ mtrr_if = NULL;
+ why = "by BIOS";
}
}
}
- if (!mtrr_enabled()) {
- pr_info("Disabled\n");
-
- /*
- * PAT initialization relies on MTRR's rendezvous handler.
- * Skip PAT init until the handler can initialize both
- * features independently.
- */
- pat_disable("MTRRs disabled, skipping PAT initialization too.");
- }
-}
-
-void mtrr_ap_init(void)
-{
if (!mtrr_enabled())
- return;
-
- if (!use_intel() || mtrr_aps_delayed_init)
- return;
-
- /*
- * Ideally we should hold mtrr_mutex here to avoid mtrr entries
- * changed, but this routine will be called in cpu boot time,
- * holding the lock breaks it.
- *
- * This routine is called in two cases:
- *
- * 1. very early time of software resume, when there absolutely
- * isn't mtrr entry changes;
- *
- * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
- * lock to prevent mtrr entry changes
- */
- set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
+ pr_info("MTRRs disabled %s\n", why);
}
/**
@@ -823,50 +730,12 @@ void mtrr_save_state(void)
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
}
-void set_mtrr_aps_delayed_init(void)
-{
- if (!mtrr_enabled())
- return;
- if (!use_intel())
- return;
-
- mtrr_aps_delayed_init = true;
-}
-
-/*
- * Delayed MTRR initialization for all AP's
- */
-void mtrr_aps_init(void)
-{
- if (!use_intel() || !mtrr_enabled())
- return;
-
- /*
- * Check if someone has requested the delay of AP MTRR initialization,
- * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
- * then we are done.
- */
- if (!mtrr_aps_delayed_init)
- return;
-
- set_mtrr(~0U, 0, 0, 0);
- mtrr_aps_delayed_init = false;
-}
-
-void mtrr_bp_restore(void)
-{
- if (!use_intel() || !mtrr_enabled())
- return;
-
- mtrr_if->set_all();
-}
-
static int __init mtrr_init_finialize(void)
{
if (!mtrr_enabled())
return 0;
- if (use_intel()) {
+ if (memory_caching_control & CACHE_MTRR) {
if (!changed_by_mtrr_cleanup)
mtrr_state_warn();
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2ac99e561181..02eb5871492d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -14,11 +14,8 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
- u32 use_intel_if;
void (*set)(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
- void (*set_all)(void);
-
void (*get)(unsigned int reg, unsigned long *base,
unsigned long *size, mtrr_type *type);
int (*get_free_region)(unsigned long base, unsigned long size,
@@ -53,15 +50,11 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
bool get_mtrr_state(void);
-void mtrr_bp_pat_init(void);
-
-extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
-#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
extern u64 mtrr_tom2;
@@ -71,10 +64,10 @@ void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
void mtrr_wrmsr(unsigned, unsigned, unsigned);
-/* CPU specific mtrr init functions */
-int amd_init_mtrr(void);
-int cyrix_init_mtrr(void);
-int centaur_init_mtrr(void);
+/* CPU specific mtrr_ops vectors. */
+extern const struct mtrr_ops amd_mtrr_ops;
+extern const struct mtrr_ops cyrix_mtrr_ops;
+extern const struct mtrr_ops centaur_mtrr_ops;
extern int changed_by_mtrr_cleanup;
extern int mtrr_cleanup(unsigned address_bits);
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 3266ea36667c..030d3b409768 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -100,6 +100,18 @@ struct rdt_hw_resource rdt_resources_all[] = {
.fflags = RFTYPE_RES_MB,
},
},
+ [RDT_RESOURCE_SMBA] =
+ {
+ .r_resctrl = {
+ .rid = RDT_RESOURCE_SMBA,
+ .name = "SMBA",
+ .cache_level = 3,
+ .domains = domain_init(RDT_RESOURCE_SMBA),
+ .parse_ctrlval = parse_bw,
+ .format_str = "%d=%*u",
+ .fflags = RFTYPE_RES_MB,
+ },
+ },
};
/*
@@ -150,6 +162,13 @@ bool is_mba_sc(struct rdt_resource *r)
if (!r)
return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc;
+ /*
+ * The software controller support is only applicable to MBA resource.
+ * Make sure to check for resource type.
+ */
+ if (r->rid != RDT_RESOURCE_MBA)
+ return false;
+
return r->membw.mba_sc;
}
@@ -213,9 +232,15 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
union cpuid_0x10_x_edx edx;
- u32 ebx, ecx;
+ u32 ebx, ecx, subleaf;
+
+ /*
+ * Query CPUID_Fn80000020_EDX_x01 for MBA and
+ * CPUID_Fn80000020_EDX_x02 for SMBA
+ */
+ subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, 1, &eax.full, &ebx, &ecx, &edx.full);
+ cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
hw_res->num_closid = edx.split.cos_max + 1;
r->default_ctrl = MAX_MBA_BW_AMD;
@@ -575,7 +600,7 @@ static void clear_closid_rmid(int cpu)
state->default_rmid = 0;
state->cur_closid = 0;
state->cur_rmid = 0;
- wrmsr(IA32_PQR_ASSOC, 0, 0);
+ wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
}
static int resctrl_online_cpu(unsigned int cpu)
@@ -647,6 +672,8 @@ enum {
RDT_FLAG_L2_CAT,
RDT_FLAG_L2_CDP,
RDT_FLAG_MBA,
+ RDT_FLAG_SMBA,
+ RDT_FLAG_BMEC,
};
#define RDT_OPT(idx, n, f) \
@@ -670,6 +697,8 @@ static struct rdt_options rdt_options[] __initdata = {
RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2),
RDT_OPT(RDT_FLAG_L2_CDP, "l2cdp", X86_FEATURE_CDP_L2),
RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA),
+ RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA),
+ RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC),
};
#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
@@ -699,7 +728,7 @@ static int __init set_rdt_options(char *str)
}
__setup("rdt", set_rdt_options);
-static bool __init rdt_cpu_has(int flag)
+bool __init rdt_cpu_has(int flag)
{
bool ret = boot_cpu_has(flag);
struct rdt_options *o;
@@ -734,6 +763,19 @@ static __init bool get_mem_config(void)
return false;
}
+static __init bool get_slow_mem_config(void)
+{
+ struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_SMBA];
+
+ if (!rdt_cpu_has(X86_FEATURE_SMBA))
+ return false;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return __rdt_get_mem_config_amd(&hw_res->r_resctrl);
+
+ return false;
+}
+
static __init bool get_rdt_alloc_resources(void)
{
struct rdt_resource *r;
@@ -764,6 +806,9 @@ static __init bool get_rdt_alloc_resources(void)
if (get_mem_config())
ret = true;
+ if (get_slow_mem_config())
+ ret = true;
+
return ret;
}
@@ -828,7 +873,6 @@ static __init void rdt_init_res_defs_intel(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = false;
- r->cache.arch_has_empty_bitmaps = false;
r->cache.arch_has_per_cpu_cfg = false;
r->cache.min_cbm_bits = 1;
} else if (r->rid == RDT_RESOURCE_MBA) {
@@ -849,12 +893,14 @@ static __init void rdt_init_res_defs_amd(void)
if (r->rid == RDT_RESOURCE_L3 ||
r->rid == RDT_RESOURCE_L2) {
r->cache.arch_has_sparse_bitmaps = true;
- r->cache.arch_has_empty_bitmaps = true;
r->cache.arch_has_per_cpu_cfg = true;
r->cache.min_cbm_bits = 0;
} else if (r->rid == RDT_RESOURCE_MBA) {
hw_res->msr_base = MSR_IA32_MBA_BW_BASE;
hw_res->msr_update = mba_wrmsr_amd;
+ } else if (r->rid == RDT_RESOURCE_SMBA) {
+ hw_res->msr_base = MSR_IA32_SMBA_BW_BASE;
+ hw_res->msr_update = mba_wrmsr_amd;
}
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 1dafbdc5ac31..b44c487727d4 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -105,8 +105,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return false;
}
- if ((!r->cache.arch_has_empty_bitmaps && val == 0) ||
- val > r->default_ctrl) {
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) {
rdt_last_cmd_puts("Mask out of range\n");
return false;
}
@@ -210,7 +209,7 @@ static int parse_line(char *line, struct resctrl_schema *s,
unsigned long dom_id;
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
- r->rid == RDT_RESOURCE_MBA) {
+ (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
return -EINVAL;
}
@@ -311,7 +310,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
enum resctrl_conf_type t;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int cpu;
u32 idx;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
@@ -342,13 +340,9 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
if (cpumask_empty(cpu_mask))
goto done;
- cpu = get_cpu();
- /* Update resource control msr on this CPU if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update resource control msr on other CPUs. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
+
+ /* Update resource control msr on all the CPUs. */
+ on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
done:
free_cpumask_var(cpu_mask);
@@ -374,7 +368,6 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
{
struct resctrl_schema *s;
struct rdtgroup *rdtgrp;
- struct rdt_domain *dom;
struct rdt_resource *r;
char *tok, *resname;
int ret = 0;
@@ -403,10 +396,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
goto out;
}
- list_for_each_entry(s, &resctrl_schema_all, list) {
- list_for_each_entry(dom, &s->res->domains, list)
- memset(dom->staged_config, 0, sizeof(dom->staged_config));
- }
+ rdt_staged_configs_clear();
while ((tok = strsep(&buf, "\n")) != NULL) {
resname = strim(strsep(&tok, ":"));
@@ -451,6 +441,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
}
out:
+ rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
cpus_read_unlock();
return ret ?: nbytes;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 5f7128686cfd..85ceaf9a31ac 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -8,16 +8,6 @@
#include <linux/fs_context.h>
#include <linux/jump_label.h>
-#define MSR_IA32_L3_QOS_CFG 0xc81
-#define MSR_IA32_L2_QOS_CFG 0xc82
-#define MSR_IA32_L3_CBM_BASE 0xc90
-#define MSR_IA32_L2_CBM_BASE 0xd10
-#define MSR_IA32_MBA_THRTL_BASE 0xd50
-#define MSR_IA32_MBA_BW_BASE 0xc0000200
-
-#define MSR_IA32_QM_CTR 0x0c8e
-#define MSR_IA32_QM_EVTSEL 0x0c8d
-
#define L3_QOS_CDP_ENABLE 0x01ULL
#define L2_QOS_CDP_ENABLE 0x01ULL
@@ -40,6 +30,29 @@
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
+/* Reads to Local DRAM Memory */
+#define READS_TO_LOCAL_MEM BIT(0)
+
+/* Reads to Remote DRAM Memory */
+#define READS_TO_REMOTE_MEM BIT(1)
+
+/* Non-Temporal Writes to Local Memory */
+#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2)
+
+/* Non-Temporal Writes to Remote Memory */
+#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3)
+
+/* Reads to Local Memory the system identifies as "Slow Memory" */
+#define READS_TO_LOCAL_S_MEM BIT(4)
+
+/* Reads to Remote Memory the system identifies as "Slow Memory" */
+#define READS_TO_REMOTE_S_MEM BIT(5)
+
+/* Dirty Victims to All Types of Memory */
+#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6)
+
+/* Max event bits supported */
+#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
struct rdt_fs_context {
struct kernfs_fs_context kfc;
@@ -62,11 +75,13 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
* @name: name of the event
+ * @configurable: true if the event is configurable
* @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
enum resctrl_event_id evtid;
char *name;
+ bool configurable;
struct list_head list;
};
@@ -419,6 +434,7 @@ enum resctrl_res_level {
RDT_RESOURCE_L3,
RDT_RESOURCE_L2,
RDT_RESOURCE_MBA,
+ RDT_RESOURCE_SMBA,
/* Must be the last */
RDT_NUM_RESOURCES,
@@ -521,6 +537,7 @@ void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
+bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
@@ -537,5 +554,7 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
+void __init mbm_config_rftype_init(const char *config);
+void rdt_staged_configs_clear(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index efe0c30d3a12..ded1fc7cb7cb 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -76,7 +76,7 @@ unsigned int resctrl_rmid_realloc_limit;
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
/*
- * The correction factor table is documented in Documentation/x86/resctrl.rst.
+ * The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
* by the correction factor.
*
@@ -146,6 +146,30 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
return entry;
}
+static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+{
+ u64 msr_val;
+
+ /*
+ * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
+ * with a valid event code for supported resource type and the bits
+ * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
+ * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
+ * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
+ * are error bits.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ *val = msr_val;
+ return 0;
+}
+
static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
u32 rmid,
enum resctrl_event_id eventid)
@@ -172,8 +196,29 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
struct arch_mbm_state *am;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
- if (am)
+ if (am) {
memset(am, 0, sizeof(*am));
+
+ /* Record any initial, non-zero count value. */
+ __rmid_read(rmid, eventid, &am->prev_msr);
+ }
+}
+
+/*
+ * Assumes that hardware counters are also reset and thus that there is
+ * no need to record initial non-zero counts.
+ */
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+{
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+
+ if (is_mbm_total_enabled())
+ memset(hw_dom->arch_mbm_total, 0,
+ sizeof(*hw_dom->arch_mbm_total) * r->num_rmid);
+
+ if (is_mbm_local_enabled())
+ memset(hw_dom->arch_mbm_local, 0,
+ sizeof(*hw_dom->arch_mbm_local) * r->num_rmid);
}
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
@@ -191,25 +236,14 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct arch_mbm_state *am;
u64 msr_val, chunks;
+ int ret;
if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
return -EINVAL;
- /*
- * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
- * with a valid event code for supported resource type and the bits
- * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID,
- * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
- * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
- * are error bits.
- */
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, msr_val);
-
- if (msr_val & RMID_VAL_ERROR)
- return -EIO;
- if (msr_val & RMID_VAL_UNAVAIL)
- return -EINVAL;
+ ret = __rmid_read(rmid, eventid, &msr_val);
+ if (ret)
+ return ret;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
@@ -349,41 +383,36 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
+static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
+ enum resctrl_event_id evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &d->mbm_total[rmid];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &d->mbm_local[rmid];
+ default:
+ return NULL;
+ }
+}
+
static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
struct mbm_state *m;
u64 tval = 0;
- if (rr->first)
+ if (rr->first) {
resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
+ m = get_mbm_state(rr->d, rmid, rr->evtid);
+ if (m)
+ memset(m, 0, sizeof(struct mbm_state));
+ return 0;
+ }
rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
if (rr->err)
return rr->err;
- switch (rr->evtid) {
- case QOS_L3_OCCUP_EVENT_ID:
- rr->val += tval;
- return 0;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- m = &rr->d->mbm_total[rmid];
- break;
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- m = &rr->d->mbm_local[rmid];
- break;
- default:
- /*
- * Code would never reach here because an invalid
- * event id would fail in resctrl_arch_rmid_read().
- */
- return -EINVAL;
- }
-
- if (rr->first) {
- memset(m, 0, sizeof(struct mbm_state));
- return 0;
- }
-
rr->val += tval;
return 0;
@@ -746,7 +775,7 @@ static void l3_mon_evt_init(struct rdt_resource *r)
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
-int rdt_get_mon_l3_config(struct rdt_resource *r)
+int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -783,6 +812,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
if (ret)
return ret;
+ if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
+ mbm_total_event.configurable = true;
+ mbm_config_rftype_init("mbm_total_bytes_config");
+ }
+ if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
+ mbm_local_event.configurable = true;
+ mbm_config_rftype_init("mbm_local_bytes_config");
+ }
+ }
+
l3_mon_evt_init(r);
r->mon_capable = true;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index d961ae3ed96e..458cb7419502 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -477,7 +477,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* pseudo-locked followed by reading of kernel memory to load it
* into the cache.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
/*
* Cache was flushed earlier. Now access kernel memory to read it
* into cache region associated with just activated plr->closid.
@@ -513,7 +513,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* Critical section end: restore closid with capacity bitmask that
* does not overlap with pseudo-locked region.
*/
- __wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
+ __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p);
/* Re-enable the hardware prefetcher(s) */
wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
@@ -1560,9 +1560,9 @@ static const struct file_operations pseudo_lock_dev_fops = {
.mmap = pseudo_lock_dev_mmap,
};
-static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
{
- struct rdtgroup *rdtgrp;
+ const struct rdtgroup *rdtgrp;
rdtgrp = dev_get_drvdata(dev);
if (mode)
@@ -1580,7 +1580,7 @@ int rdt_pseudo_lock_init(void)
pseudo_lock_major = ret;
- pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
+ pseudo_lock_class = class_create("pseudo_lock");
if (IS_ERR(pseudo_lock_class)) {
ret = PTR_ERR(pseudo_lock_class);
unregister_chrdev(pseudo_lock_major, "pseudo_lock");
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index e5a48f05e787..6ad33f355861 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -78,6 +78,19 @@ void rdt_last_cmd_printf(const char *fmt, ...)
va_end(ap);
}
+void rdt_staged_configs_clear(void)
+{
+ struct rdt_resource *r;
+ struct rdt_domain *dom;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for_each_alloc_capable_rdt_resource(r) {
+ list_for_each_entry(dom, &r->domains, list)
+ memset(dom->staged_config, 0, sizeof(dom->staged_config));
+ }
+}
+
/*
* Trivial allocator for CLOSIDs. Since h/w only supports a small number,
* we can keep a bitmap of free CLOSIDs in a single integer.
@@ -314,7 +327,7 @@ static void update_cpu_closid_rmid(void *info)
* executing task might have its own closid selected. Just reuse
* the context switch code.
*/
- resctrl_sched_in();
+ resctrl_sched_in(current);
}
/*
@@ -325,12 +338,7 @@ static void update_cpu_closid_rmid(void *info)
static void
update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
{
- int cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, cpu_mask))
- update_cpu_closid_rmid(r);
- smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
- put_cpu();
+ on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1);
}
static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
@@ -535,7 +543,7 @@ static void _update_task_closid_rmid(void *task)
* Otherwise, the MSR is updated when the task is scheduled in.
*/
if (task == current)
- resctrl_sched_in();
+ resctrl_sched_in(task);
}
static void update_task_closid_rmid(struct task_struct *t)
@@ -580,8 +588,10 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
/*
* Ensure the task's closid and rmid are written before determining if
* the task is current that will decide if it will be interrupted.
+ * This pairs with the full barrier between the rq->curr update and
+ * resctrl_sched_in() during context switch.
*/
- barrier();
+ smp_mb();
/*
* By now, the task's closid and rmid are set. If the task is current
@@ -1001,8 +1011,11 @@ static int rdt_mon_features_show(struct kernfs_open_file *of,
struct rdt_resource *r = of->kn->parent->priv;
struct mon_evt *mevt;
- list_for_each_entry(mevt, &r->evt_list, list)
+ list_for_each_entry(mevt, &r->evt_list, list) {
seq_printf(seq, "%s\n", mevt->name);
+ if (mevt->configurable)
+ seq_printf(seq, "%s_config\n", mevt->name);
+ }
return 0;
}
@@ -1213,7 +1226,7 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- if (r->rid == RDT_RESOURCE_MBA)
+ if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
continue;
has_cache = true;
list_for_each_entry(d, &r->domains, list) {
@@ -1402,7 +1415,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
ctrl = resctrl_arch_get_config(r, d,
closid,
type);
- if (r->rid == RDT_RESOURCE_MBA)
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA)
size = ctrl;
else
size = rdtgroup_cbm_to_size(r, d, ctrl);
@@ -1419,6 +1433,248 @@ out:
return ret;
}
+struct mon_config_info {
+ u32 evtid;
+ u32 mon_config;
+};
+
+#define INVALID_CONFIG_INDEX UINT_MAX
+
+/**
+ * mon_event_config_index_get - get the hardware index for the
+ * configurable event
+ * @evtid: event id.
+ *
+ * Return: 0 for evtid == QOS_L3_MBM_TOTAL_EVENT_ID
+ * 1 for evtid == QOS_L3_MBM_LOCAL_EVENT_ID
+ * INVALID_CONFIG_INDEX for invalid evtid
+ */
+static inline unsigned int mon_event_config_index_get(u32 evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return 0;
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return 1;
+ default:
+ /* Should never reach here */
+ return INVALID_CONFIG_INDEX;
+ }
+}
+
+static void mon_event_config_read(void *info)
+{
+ struct mon_config_info *mon_info = info;
+ unsigned int index;
+ u64 msrval;
+
+ index = mon_event_config_index_get(mon_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ return;
+ }
+ rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval);
+
+ /* Report only the valid event configuration bits */
+ mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
+}
+
+static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+{
+ smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+}
+
+static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+{
+ struct mon_config_info mon_info = {0};
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ memset(&mon_info, 0, sizeof(struct mon_config_info));
+ mon_info.evtid = evtid;
+ mondata_config_read(dom, &mon_info);
+
+ seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ return 0;
+}
+
+static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ return 0;
+}
+
+static void mon_event_config_write(void *info)
+{
+ struct mon_config_info *mon_info = info;
+ unsigned int index;
+
+ index = mon_event_config_index_get(mon_info->evtid);
+ if (index == INVALID_CONFIG_INDEX) {
+ pr_warn_once("Invalid event id %d\n", mon_info->evtid);
+ return;
+ }
+ wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
+}
+
+static int mbm_config_write_domain(struct rdt_resource *r,
+ struct rdt_domain *d, u32 evtid, u32 val)
+{
+ struct mon_config_info mon_info = {0};
+ int ret = 0;
+
+ /* mon_config cannot be more than the supported set of events */
+ if (val > MAX_EVT_CONFIG_BITS) {
+ rdt_last_cmd_puts("Invalid event configuration\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Read the current config value first. If both are the same then
+ * no need to write it again.
+ */
+ mon_info.evtid = evtid;
+ mondata_config_read(d, &mon_info);
+ if (mon_info.mon_config == val)
+ goto out;
+
+ mon_info.mon_config = val;
+
+ /*
+ * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
+ * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
+ * are scoped at the domain level. Writing any of these MSRs
+ * on one CPU is observed by all the CPUs in the domain.
+ */
+ smp_call_function_any(&d->cpu_mask, mon_event_config_write,
+ &mon_info, 1);
+
+ /*
+ * When an Event Configuration is changed, the bandwidth counters
+ * for all RMIDs and Events will be cleared by the hardware. The
+ * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+ * every RMID on the next read to any event for every RMID.
+ * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+ * cleared while it is tracked by the hardware. Clear the
+ * mbm_local and mbm_total counts for all the RMIDs.
+ */
+ resctrl_arch_reset_rmid_all(r, d);
+
+out:
+ return ret;
+}
+
+static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
+{
+ char *dom_str = NULL, *id_str;
+ unsigned long dom_id, val;
+ struct rdt_domain *d;
+ int ret = 0;
+
+next:
+ if (!tok || tok[0] == '\0')
+ return 0;
+
+ /* Start processing the strings for each domain */
+ dom_str = strim(strsep(&tok, ";"));
+ id_str = strsep(&dom_str, "=");
+
+ if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+ return -EINVAL;
+ }
+
+ if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+ rdt_last_cmd_puts("Non-numeric event configuration value\n");
+ return -EINVAL;
+ }
+
+ list_for_each_entry(d, &r->domains, list) {
+ if (d->id == dom_id) {
+ ret = mbm_config_write_domain(r, d, evtid, val);
+ if (ret)
+ return -EINVAL;
+ goto next;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
+static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret ?: nbytes;
+}
+
/* rdtgroup information files for one cache resource. */
static struct rftype res_common_files[] = {
{
@@ -1518,6 +1774,20 @@ static struct rftype res_common_files[] = {
.fflags = RF_MON_INFO | RFTYPE_RES_CACHE,
},
{
+ .name = "mbm_total_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_total_bytes_config_show,
+ .write = mbm_total_bytes_config_write,
+ },
+ {
+ .name = "mbm_local_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_local_bytes_config_show,
+ .write = mbm_local_bytes_config_write,
+ },
+ {
.name = "cpus",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1623,6 +1893,15 @@ void __init thread_throttle_mode_init(void)
rft->fflags = RF_CTRL_INFO | RFTYPE_RES_MB;
}
+void __init mbm_config_rftype_init(const char *config)
+{
+ struct rftype *rft;
+
+ rft = rdtgroup_get_rftype_by_name(config);
+ if (rft)
+ rft->fflags = RF_MON_INFO | RFTYPE_RES_CACHE;
+}
+
/**
* rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
* @r: The resource group with which the file is associated.
@@ -1864,13 +2143,9 @@ static int set_cache_qos_cfg(int level, bool enable)
/* Pick one CPU from each domain instance to update MSR */
cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
}
- cpu = get_cpu();
- /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- update(&enable);
- /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, update, &enable, 1);
- put_cpu();
+
+ /* Update QOS_CFG MSR on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, update, &enable, 1);
free_cpumask_var(cpu_mask);
@@ -2347,7 +2622,7 @@ static int reset_all_ctrls(struct rdt_resource *r)
struct msr_param msr_param;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- int i, cpu;
+ int i;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
@@ -2368,13 +2643,9 @@ static int reset_all_ctrls(struct rdt_resource *r)
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
}
- cpu = get_cpu();
- /* Update CBM on this cpu if it's in cpu_mask. */
- if (cpumask_test_cpu(cpu, cpu_mask))
- rdt_ctrl_update(&msr_param);
- /* Update CBM on all other cpus in cpu_mask. */
- smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1);
- put_cpu();
+
+ /* Update CBM on all the CPUs in cpu_mask */
+ on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
free_cpumask_var(cpu_mask);
@@ -2402,6 +2673,14 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
WRITE_ONCE(t->rmid, to->mon.rmid);
/*
+ * Order the closid/rmid stores above before the loads
+ * in task_curr(). This pairs with the full barrier
+ * between the rq->curr update and resctrl_sched_in()
+ * during context switch.
+ */
+ smp_mb();
+
+ /*
* If the task is on a CPU, set the CPU in the mask.
* The detection is inaccurate as tasks might move or
* schedule before the smp function call takes place.
@@ -2841,31 +3120,36 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
{
struct resctrl_schema *s;
struct rdt_resource *r;
- int ret;
+ int ret = 0;
+
+ rdt_staged_configs_clear();
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
- if (r->rid == RDT_RESOURCE_MBA) {
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA) {
rdtgroup_init_mba(r, rdtgrp->closid);
if (is_mba_sc(r))
continue;
} else {
ret = rdtgroup_init_cat(s, rdtgrp->closid);
if (ret < 0)
- return ret;
+ goto out;
}
ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Failed to initialize allocations\n");
- return ret;
+ goto out;
}
}
rdtgrp->mode = RDT_MODE_SHAREABLE;
- return 0;
+out:
+ rdt_staged_configs_clear();
+ return ret;
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index fc01f81f6e2a..0dad49a09b7a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -40,10 +40,13 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
{ X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
{ X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
{ X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index aa9b8b868867..262f5fb18d74 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -95,7 +95,7 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma->vm_ops = &sgx_vm_ops;
- vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+ vm_flags_set(vma, VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO);
vma->vm_private_data = encl;
return 0;
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 1ec20807de1e..2a0e90fe2abc 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -160,8 +160,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
return ret;
pginfo.addr = encl_page->desc & PAGE_MASK;
- pginfo.contents = (unsigned long)kmap_atomic(b.contents);
- pcmd_page = kmap_atomic(b.pcmd);
+ pginfo.contents = (unsigned long)kmap_local_page(b.contents);
+ pcmd_page = kmap_local_page(b.pcmd);
pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;
if (secs_page)
@@ -187,8 +187,8 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
*/
pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
- kunmap_atomic(pcmd_page);
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local(pcmd_page);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
get_page(b.pcmd);
sgx_encl_put_backing(&b);
@@ -197,10 +197,10 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
- pcmd_page = kmap_atomic(b.pcmd);
+ pcmd_page = kmap_local_page(b.pcmd);
if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
pr_warn("PCMD page not empty after truncate.\n");
- kunmap_atomic(pcmd_page);
+ kunmap_local(pcmd_page);
}
put_page(b.pcmd);
@@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
unsigned long addr,
unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *entry;
entry = xa_load(&encl->page_array, PFN_DOWN(addr));
@@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma)
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
unsigned long end, unsigned long vm_flags)
{
- unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS;
struct sgx_encl_page *page;
unsigned long count = 0;
int ret = 0;
@@ -680,11 +680,15 @@ const struct vm_operations_struct sgx_vm_ops = {
void sgx_encl_release(struct kref *ref)
{
struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ unsigned long max_page_index = PFN_DOWN(encl->base + encl->size - 1);
struct sgx_va_page *va_page;
struct sgx_encl_page *entry;
- unsigned long index;
+ unsigned long count = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(encl->base));
- xa_for_each(&encl->page_array, index, entry) {
+ xas_lock(&xas);
+ xas_for_each(&xas, entry, max_page_index) {
if (entry->epc_page) {
/*
* The page and its radix tree entry cannot be freed
@@ -699,9 +703,20 @@ void sgx_encl_release(struct kref *ref)
}
kfree(entry);
- /* Invoke scheduler to prevent soft lockups. */
- cond_resched();
+ /*
+ * Invoke scheduler on every XA_CHECK_SCHED iteration
+ * to prevent soft lockups.
+ */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+
+ cond_resched();
+
+ xas_lock(&xas);
+ }
}
+ xas_unlock(&xas);
xa_destroy(&encl->page_array);
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index ebe79d60619f..21ca0a831b70 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -111,7 +111,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
encl->base = secs->base;
encl->size = secs->size;
encl->attributes = secs->attributes;
- encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
+ encl->attributes_mask = SGX_ATTR_UNPRIV_MASK;
/* Set only after completion, as encl->lock has not been taken. */
set_bit(SGX_ENCL_CREATED, &encl->flags);
@@ -221,11 +221,11 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
pginfo.addr = encl_page->desc & PAGE_MASK;
pginfo.metadata = (unsigned long)secinfo;
- pginfo.contents = (unsigned long)kmap_atomic(src_page);
+ pginfo.contents = (unsigned long)kmap_local_page(src_page);
ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
- kunmap_atomic((void *)pginfo.contents);
+ kunmap_local((void *)pginfo.contents);
put_page(src_page);
return ret ? -EIO : 0;
@@ -356,6 +356,9 @@ static int sgx_validate_offset_length(struct sgx_encl *encl,
if (!length || !IS_ALIGNED(length, PAGE_SIZE))
return -EINVAL;
+ if (offset + length < offset)
+ return -EINVAL;
+
if (offset + length - PAGE_SIZE >= encl->size)
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 0aad028f04d4..166692f2d501 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -165,17 +165,17 @@ static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
pginfo.addr = 0;
pginfo.secs = 0;
- pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
- pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
+ pginfo.contents = (unsigned long)kmap_local_page(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_local_page(backing->pcmd) +
backing->pcmd_offset;
ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
set_page_dirty(backing->pcmd);
set_page_dirty(backing->contents);
- kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
+ kunmap_local((void *)(unsigned long)(pginfo.metadata -
backing->pcmd_offset));
- kunmap_atomic((void *)(unsigned long)pginfo.contents);
+ kunmap_local((void *)(unsigned long)pginfo.contents);
return ret;
}
@@ -892,20 +892,19 @@ static struct miscdevice sgx_dev_provision = {
int sgx_set_attribute(unsigned long *allowed_attributes,
unsigned int attribute_fd)
{
- struct file *file;
+ struct fd f = fdget(attribute_fd);
- file = fget(attribute_fd);
- if (!file)
+ if (!f.file)
return -EINVAL;
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
+ if (f.file->f_op != &sgx_provision_fops) {
+ fdput(f);
return -EINVAL;
}
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
- fput(file);
+ fdput(f);
return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 0f2020653fba..d2dad21259a8 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -15,7 +15,7 @@
#define EREMOVE_ERROR_MESSAGE \
"EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
- "Refer to Documentation/x86/sgx.rst for more information."
+ "Refer to Documentation/arch/x86/sgx.rst for more information."
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 6a77a14eee38..c3e37eaec8ec 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -105,7 +105,7 @@ static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_ops = &sgx_vepc_vm_ops;
/* Don't copy VMA in fork() */
- vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vm_flags_set(vma, VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY);
vma->vm_private_data = vepc;
return 0;
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 5e868b62a7c4..0270925fe013 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -79,7 +79,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c)
* initial apic id, which also represents 32-bit extended x2apic id.
*/
c->initial_apicid = edx;
- smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
#endif
return 0;
}
@@ -109,7 +109,8 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
*/
cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
c->initial_apicid = edx;
- core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+ smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index ec7bbac3a9f2..b31ee4f1657a 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -11,6 +11,7 @@
#include <linux/cpufeature.h>
#include <asm/cmdline.h>
+#include <asm/cpu.h>
#include "cpu.h"
@@ -58,24 +59,6 @@ static void tsx_enable(void)
wrmsrl(MSR_IA32_TSX_CTRL, tsx);
}
-static bool tsx_ctrl_is_supported(void)
-{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
- /*
- * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
- * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
- *
- * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
- * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
- * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
- * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
- * tsx= cmdline requests will do nothing on CPUs without
- * MSR_IA32_TSX_CTRL support.
- */
- return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR);
-}
-
static enum tsx_ctrl_states x86_get_tsx_auto_mode(void)
{
if (boot_cpu_has_bug(X86_BUG_TAA))
@@ -135,7 +118,7 @@ static void tsx_clear_cpuid(void)
rdmsrl(MSR_TSX_FORCE_ABORT, msr);
msr |= MSR_TFA_TSX_CPUID_CLEAR;
wrmsrl(MSR_TSX_FORCE_ABORT, msr);
- } else if (tsx_ctrl_is_supported()) {
+ } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) {
rdmsrl(MSR_IA32_TSX_CTRL, msr);
msr |= TSX_CTRL_CPUID_CLEAR;
wrmsrl(MSR_IA32_TSX_CTRL, msr);
@@ -158,7 +141,8 @@ static void tsx_dev_mode_disable(void)
u64 mcu_opt_ctrl;
/* Check if RTM_ALLOW exists */
- if (!boot_cpu_has_bug(X86_BUG_TAA) || !tsx_ctrl_is_supported() ||
+ if (!boot_cpu_has_bug(X86_BUG_TAA) ||
+ !cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL) ||
!cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL))
return;
@@ -191,7 +175,20 @@ void __init tsx_init(void)
return;
}
- if (!tsx_ctrl_is_supported()) {
+ /*
+ * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this
+ * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES.
+ *
+ * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a
+ * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES
+ * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get
+ * MSR_IA32_TSX_CTRL support even after a microcode update. Thus,
+ * tsx= cmdline requests will do nothing on CPUs without
+ * MSR_IA32_TSX_CTRL support.
+ */
+ if (x86_read_arch_cap_msr() & ARCH_CAP_TSX_CTRL_MSR) {
+ setup_force_cpu_cap(X86_FEATURE_MSR_TSX_CTRL);
+ } else {
tsx_ctrl_state = TSX_CTRL_NOT_SUPPORTED;
return;
}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index ec8064c0ae03..2293efd6ffa6 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -232,7 +232,11 @@ static int __init umwait_init(void)
* Add umwait control interface. Ignore failure, so at least the
* default values are set up in case the machine manages to boot.
*/
- dev = cpu_subsys.dev_root;
- return sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ dev = bus_get_dev_root(&cpu_subsys);
+ if (dev) {
+ ret = sysfs_create_group(&dev->kobj, &umwait_attr_group);
+ put_device(dev);
+ }
+ return ret;
}
device_initcall(umwait_init);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 02039ec3597d..11f83d07925e 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -143,7 +143,7 @@ static __init int parse_no_stealacc(char *arg)
}
early_param("no-steal-acc", parse_no_stealacc);
-static unsigned long long notrace vmware_sched_clock(void)
+static noinstr u64 vmware_sched_clock(void)
{
unsigned long long ns;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6f7b8cc1bc9f..bdc0d5539b57 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -139,7 +139,7 @@ static int cpuid_device_destroy(unsigned int cpu)
return 0;
}
-static char *cpuid_devnode(struct device *dev, umode_t *mode)
+static char *cpuid_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
@@ -154,7 +154,7 @@ static int __init cpuid_init(void)
CPUID_MAJOR);
return -EBUSY;
}
- cpuid_class = class_create(THIS_MODULE, "cpuid");
+ cpuid_class = class_create("cpuid");
if (IS_ERR(cpuid_class)) {
err = PTR_ERR(cpuid_class);
goto out_chrdev;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9730c88530fc..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
@@ -401,10 +386,8 @@ int crash_load_segments(struct kimage *image)
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
- if (ret) {
- vfree((void *)image->elf_headers);
+ if (ret)
return ret;
- }
image->elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e75bc2f217ff..32d710f7eb84 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -57,7 +57,7 @@ ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
struct kvec kvec = { .iov_base = buf, .iov_len = count };
struct iov_iter iter;
- iov_iter_kvec(&iter, READ, &kvec, 1, count);
+ iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count);
return read_from_oldmem(&iter, count, ppos,
cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 5cd51f25f446..28da5dd83fc0 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -31,11 +31,6 @@ char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-void __init early_init_dt_add_memory_arch(u64 base, u64 size)
-{
- BUG();
-}
-
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -167,7 +162,14 @@ static void __init dtb_lapic_setup(void)
return;
}
smp_found_config = 1;
- pic_mode = 1;
+ if (of_property_read_bool(dn, "intel,virtual-wire-mode")) {
+ pr_info("Virtual Wire compatibility mode.\n");
+ pic_mode = 0;
+ } else {
+ pr_info("IMCR and PIC compatibility mode.\n");
+ pic_mode = 1;
+ }
+
register_lapic_address(lapic_addr);
}
@@ -248,7 +250,7 @@ static void __init dtb_add_ioapic(struct device_node *dn)
ret = of_address_to_resource(dn, 0, &r);
if (ret) {
- printk(KERN_ERR "Can't obtain address from device node %pOF.\n", dn);
+ pr_err("Can't obtain address from device node %pOF.\n", dn);
return;
}
mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
@@ -265,7 +267,7 @@ static void __init dtb_ioapic_setup(void)
of_ioapic = 1;
return;
}
- printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+ pr_err("Error: No information about IO-APIC in OF.\n");
}
#else
static void __init dtb_ioapic_setup(void) {}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 0bf6779187dd..f18ca44c904b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -195,7 +195,6 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
- stack = stack ? : get_stack_pointer(task, regs);
regs = unwind_get_entry_regs(&state, &partial);
/*
@@ -214,9 +213,13 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - hardirq stack
* - entry stack
*/
- for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
+ for (stack = stack ?: get_stack_pointer(task, regs);
+ stack;
+ stack = stack_info.next_sp) {
const char *stack_name;
+ stack = PTR_ALIGN(stack, sizeof(long));
+
if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
/*
* We weren't on a valid stack. It's possible that
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 722fd712e1cf..b4905d5173fd 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
@@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
+ unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/*
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6c5defd6569a..f05339fee778 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
+ unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr);
unsigned long *begin;
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9dac24680ff8..fb8cf953380d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -53,7 +53,7 @@
*
* Once the E820 map has been converted to the standard Linux memory layout
* information its role stops - modifying it has no effect and does not get
- * re-propagated. So itsmain role is a temporary bootstrap storage of firmware
+ * re-propagated. So its main role is a temporary bootstrap storage of firmware
* specific memory layout data during early bootup.
*/
static struct e820_table e820_table_init __initdata;
@@ -395,7 +395,7 @@ int __init e820__update_table(struct e820_table *table)
/* Continue building up new map based on this information: */
if (current_type != last_type || e820_nomerge(current_type)) {
- if (last_type != 0) {
+ if (last_type) {
new_entries[new_nr_entries].size = change_point[chg_idx]->addr - last_addr;
/* Move forward only if the new size was non-zero: */
if (new_entries[new_nr_entries].size != 0)
@@ -403,7 +403,7 @@ int __init e820__update_table(struct e820_table *table)
if (++new_nr_entries >= max_nr_entries)
break;
}
- if (current_type != 0) {
+ if (current_type) {
new_entries[new_nr_entries].addr = change_point[chg_idx]->addr;
new_entries[new_nr_entries].type = current_type;
last_addr = change_point[chg_idx]->addr;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9417d5aa7305..16f9814c9be0 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -94,17 +94,7 @@ static inline unsigned long espfix_base_addr(unsigned int cpu)
static void init_espfix_random(void)
{
- unsigned long rand;
-
- /*
- * This is run before the entropy pools are initialized,
- * but this is hopefully better than nothing.
- */
- if (!arch_get_random_longs(&rand, 1)) {
- /* The constant is an arbitrary large prime */
- rand = rdtsc();
- rand *= 0xc345c6b72fd16123UL;
- }
+ unsigned long rand = get_random_long();
slot_random = rand % ESPFIX_STACKS_PER_PAGE;
page_random = (rand / ESPFIX_STACKS_PER_PAGE)
diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h
index 958accf2ccf0..af5cbdd9bd29 100644
--- a/arch/x86/kernel/fpu/context.h
+++ b/arch/x86/kernel/fpu/context.h
@@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void)
struct fpu *fpu = &current->thread.fpu;
int cpu = smp_processor_id();
- if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
+ if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER)))
return;
if (!fpregs_state_valid(fpu, cpu)) {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3b28c5b25e12..1015af1ae562 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
{
struct fpstate *kstate = gfpu->fpstate;
const union fpregs_state *ustate = buf;
- struct pkru_state *xpkru;
- int ret;
if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
@@ -406,16 +404,15 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
- ret = copy_uabi_from_kernel_to_xstate(kstate, ustate);
- if (ret)
- return ret;
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
- /* Retrieve PKRU if not in init state */
- if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
- xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
- *vpkru = xpkru->pkru;
- }
- return 0;
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
@@ -429,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & PF_KTHREAD) &&
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
!test_thread_flag(TIF_NEED_FPU_LOAD)) {
set_thread_flag(TIF_NEED_FPU_LOAD);
save_fpregs_to_fpstate(&current->thread.fpu);
@@ -605,9 +602,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
if (test_thread_flag(TIF_NEED_FPU_LOAD))
fpregs_restore_userregs();
save_fpregs_to_fpstate(dst_fpu);
+ fpregs_unlock();
if (!(clone_flags & CLONE_THREAD))
fpu_inherit_perms(dst_fpu);
- fpregs_unlock();
/*
* Children never inherit PASID state.
@@ -856,12 +853,12 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* Initialize register state that may prevent from entering low-power idle.
* This function will be invoked from the cpuidle driver only when needed.
*/
-void fpu_idle_fpregs(void)
+noinstr void fpu_idle_fpregs(void)
{
/* Note: AMX_TILE being enabled implies XGETBV1 support */
if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
(xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
tile_release();
- fpregs_deactivate(&current->thread.fpu);
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 8946f89761cc..851eb13edc01 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -133,9 +133,6 @@ static void __init fpu__init_system_generic(void)
fpu__init_system_mxcsr();
}
-/* Get alignment of the TYPE. */
-#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
-
/*
* Enforce that 'MEMBER' is the last field of 'TYPE'.
*
@@ -143,8 +140,8 @@ static void __init fpu__init_system_generic(void)
* because that's how C aligns structs.
*/
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
- TYPE_ALIGN(TYPE)))
+ BUILD_BUG_ON(sizeof(TYPE) != \
+ ALIGN(offsetofend(TYPE, MEMBER), _Alignof(TYPE)))
/*
* We append the 'struct fpu' to the task_struct:
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 75ffaef8c299..6d056b68f4ed 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -167,7 +167,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
}
fpu_force_restore(fpu);
- ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf);
+ ret = copy_uabi_from_kernel_to_xstate(fpu->fpstate, kbuf ?: tmpbuf, &target->thread.pkru);
out:
vfree(tmpbuf);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 91d4b6de58ab..558076dbde5b 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -396,7 +396,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
fpregs = &fpu->fpstate->regs;
if (use_xsave() && !fx_only) {
- if (copy_sigframe_from_user_to_xstate(fpu->fpstate, buf_fx))
+ if (copy_sigframe_from_user_to_xstate(tsk, buf_fx))
return false;
} else {
if (__copy_from_user(&fpregs->fxsave, buf_fx,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 59e543b95a3c..0bab497c9436 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -440,8 +440,8 @@ static void __init __xstate_dump_leaves(void)
}
}
-#define XSTATE_WARN_ON(x) do { \
- if (WARN_ONCE(x, "XSAVE consistency problem, dumping leaves")) { \
+#define XSTATE_WARN_ON(x, fmt, ...) do { \
+ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \
__xstate_dump_leaves(); \
} \
} while (0)
@@ -554,8 +554,7 @@ static bool __init check_xstate_against_struct(int nr)
(nr >= XFEATURE_MAX) ||
(nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
- WARN_ONCE(1, "no structure for xstate: %d\n", nr);
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
return false;
}
return true;
@@ -598,12 +597,13 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size)
* XSAVES.
*/
if (!xsaves && xfeature_is_supervisor(i)) {
- XSTATE_WARN_ON(1);
+ XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i);
return false;
}
}
size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted);
- XSTATE_WARN_ON(size != kernel_size);
+ XSTATE_WARN_ON(size != kernel_size,
+ "size %u != kernel_size %u\n", size, kernel_size);
return size == kernel_size;
}
@@ -1118,21 +1118,20 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
zerofrom = offsetof(struct xregs_state, extended_state_area);
/*
- * The ptrace buffer is in non-compacted XSAVE format. In
- * non-compacted format disabled features still occupy state space,
- * but there is no state to copy from in the compacted
- * init_fpstate. The gap tracking will zero these states.
- */
- mask = fpstate->user_xfeatures;
-
- /*
- * Dynamic features are not present in init_fpstate. When they are
- * in an all zeros init state, remove those from 'mask' to zero
- * those features in the user buffer instead of retrieving them
- * from init_fpstate.
+ * This 'mask' indicates which states to copy from fpstate.
+ * Those extended states that are not present in fpstate are
+ * either disabled or initialized:
+ *
+ * In non-compacted format, disabled features still occupy
+ * state space but there is no state to copy from in the
+ * compacted init_fpstate. The gap tracking will zero these
+ * states.
+ *
+ * The extended features have an all zeroes init state. Thus,
+ * remove them from 'mask' to zero those features in the user
+ * buffer instead of retrieving them from init_fpstate.
*/
- if (fpu_state_size_dynamic())
- mask &= (header.xfeatures | xinit->header.xcomp_bv);
+ mask = header.xfeatures;
for_each_extended_xfeature(i, mask) {
/*
@@ -1151,9 +1150,8 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
pkru.pkru = pkru_val;
membuf_write(&to, &pkru, sizeof(pkru));
} else {
- copy_feature(header.xfeatures & BIT_ULL(i), &to,
+ membuf_write(&to,
__raw_xsave_addr(xsave, i),
- __raw_xsave_addr(xinit, i),
xstate_sizes[i]);
}
/*
@@ -1200,8 +1198,36 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
}
+/**
+ * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate
+ * @fpstate: The fpstate buffer to copy to
+ * @kbuf: The UABI format buffer, if it comes from the kernel
+ * @ubuf: The UABI format buffer, if it comes from userspace
+ * @pkru: The location to write the PKRU value to
+ *
+ * Converts from the UABI format into the kernel internal hardware
+ * dependent format.
+ *
+ * This function ultimately has three different callers with distinct PKRU
+ * behavior.
+ * 1. When called from sigreturn the PKRU register will be restored from
+ * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
+ * @fpstate is sufficient to cover this case, but the caller will also
+ * pass a pointer to the thread_struct's pkru field in @pkru and updating
+ * it is harmless.
+ * 2. When called from ptrace the PKRU register will be restored from the
+ * thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * The kernel will restore it manually, so the XRSTOR behavior that resets
+ * the PKRU register to the hardware init value (0) if the corresponding
+ * xfeatures bit is not set is emulated here.
+ * 3. When called from KVM the PKRU register will be restored from the vcpu's
+ * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
+ * XRSTOR and hasn't had the PKRU resetting behavior described above. To
+ * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
+ * bit is not set.
+ */
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
- const void __user *ubuf)
+ const void __user *ubuf, u32 *pkru)
{
struct xregs_state *xsave = &fpstate->regs.xsave;
unsigned int offset, size;
@@ -1250,6 +1276,20 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
}
}
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ } else {
+ /*
+ * KVM may pass NULL here to indicate that it does not need
+ * PKRU updated.
+ */
+ if (pkru)
+ *pkru = 0;
+ }
+
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1268,9 +1308,9 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
* Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S]
* format and copy to the target thread. Used by ptrace and KVM.
*/
-int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
+int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru)
{
- return copy_uabi_to_xstate(fpstate, kbuf, NULL);
+ return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru);
}
/*
@@ -1278,10 +1318,10 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf)
* XSAVE[S] format and copy to the target thread. This is called from the
* sigreturn() and rt_sigreturn() system calls.
*/
-int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate,
+int copy_sigframe_from_user_to_xstate(struct task_struct *tsk,
const void __user *ubuf)
{
- return copy_uabi_to_xstate(fpstate, NULL, ubuf);
+ return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru);
}
static bool validate_independent_components(u64 mask)
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 5ad47031383b..a4ecb04d8d64 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -46,8 +46,8 @@ extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
u32 pkru_val, enum xstate_copy_mode copy_mode);
extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk,
enum xstate_copy_mode mode);
-extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf);
-extern int copy_sigframe_from_user_to_xstate(struct fpstate *fpstate, const void __user *ubuf);
+extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru);
+extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf);
extern void fpu__init_cpu_xstate(void);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index bd165004776d..5e7ead52cfdb 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,10 +24,10 @@
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
#include <trace/syscall.h>
-#include <asm/set_memory.h>
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
@@ -69,6 +69,10 @@ static const char *ftrace_nop_replace(void)
static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting itself.
+ */
return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
@@ -217,7 +221,9 @@ void ftrace_replace_code(int enable)
ret = ftrace_verify_code(rec->ip, old);
if (ret) {
+ ftrace_expected = old;
ftrace_bug(ret, rec);
+ ftrace_expected = NULL;
return;
}
}
@@ -317,7 +323,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long size;
unsigned long *ptr;
void *trampoline;
- void *ip;
+ void *ip, *dest;
/* 48 8b 15 <offset> is movq <offset>(%rip), %rdx */
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
@@ -359,7 +365,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + size;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, &__x86_return_thunk, JMP32_INSN_SIZE);
+ __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
memcpy(ip, retq, sizeof(retq));
@@ -404,20 +410,20 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* put in the call to the function */
mutex_lock(&text_mutex);
call_offset -= start_offset;
+ /*
+ * No need to translate into a callthunk. The trampoline does
+ * the depth accounting before the call already.
+ */
+ dest = ftrace_ops_get_func(ops);
memcpy(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE,
- trampoline + call_offset,
- ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
- set_vm_flush_reset_perms(trampoline);
-
- if (likely(system_state != SYSTEM_BOOTING))
- set_memory_ro((unsigned long)trampoline, npages);
- set_memory_x((unsigned long)trampoline, npages);
+ set_memory_rox((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
tramp_free(trampoline);
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index a0ed0e4a2c0c..0d9a14528176 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -163,6 +163,11 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
jmp .Lftrace_ret
SYM_CODE_END(ftrace_regs_caller)
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
+
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_CODE_START(ftrace_graph_caller)
pushl %eax
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 2a4be92fd144..b8c720b5dab2 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -3,8 +3,9 @@
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
-#include <linux/linkage.h>
#include <linux/cfi_types.h>
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
#include <asm/ptrace.h>
#include <asm/ftrace.h>
#include <asm/export.h>
@@ -131,16 +132,21 @@
.endm
SYM_TYPED_FUNC_START(ftrace_stub)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_TYPED_FUNC_START(ftrace_stub_graph)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(ftrace_stub_graph)
+#endif
#ifdef CONFIG_DYNAMIC_FTRACE
SYM_FUNC_START(__fentry__)
+ CALL_DEPTH_ACCOUNT
RET
SYM_FUNC_END(__fentry__)
EXPORT_SYMBOL(__fentry__)
@@ -149,6 +155,8 @@ SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ CALL_DEPTH_ACCOUNT
+
/* Stack - skipping return address of ftrace_caller */
leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
movq %rcx, RSP(%rsp)
@@ -164,6 +172,9 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Only ops with REGS flag set should have CS register set */
movq $0, CS(%rsp)
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
call ftrace_stub
@@ -193,6 +204,8 @@ SYM_FUNC_START(ftrace_regs_caller)
save_mcount_regs 8
/* save_mcount_regs fills in first two parameters */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Load the ftrace_ops into the 3rd parameter */
@@ -223,6 +236,9 @@ SYM_INNER_LABEL(ftrace_regs_caller_op_ptr, SYM_L_GLOBAL)
/* regs go into 4th parameter */
leaq (%rsp), %rcx
+ /* Account for the function call below */
+ CALL_DEPTH_ACCOUNT
+
SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
call ftrace_stub
@@ -275,15 +291,34 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
/* Restore flags */
popfq
UNWIND_HINT_FUNC
- RET
+
+ /*
+ * The above left an extra return value on the stack; effectively
+ * doing a tail-call without using a register. This PUSH;RET
+ * pattern unbalances the RSB, inject a pointless CALL to rebalance.
+ */
+ ANNOTATE_INTRA_FUNCTION_CALL
+ CALL .Ldo_rebalance
+ int3
+.Ldo_rebalance:
+ add $8, %rsp
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
SYM_FUNC_END(ftrace_regs_caller)
STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller)
+SYM_FUNC_START(ftrace_stub_direct_tramp)
+ CALL_DEPTH_ACCOUNT
+ RET
+SYM_FUNC_END(ftrace_stub_direct_tramp)
#else /* ! CONFIG_DYNAMIC_FTRACE */
SYM_FUNC_START(__fentry__)
+ CALL_DEPTH_ACCOUNT
+
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
RET
@@ -311,7 +346,7 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_CODE_START(return_to_handler)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
subq $16, %rsp
@@ -337,6 +372,8 @@ SYM_CODE_START(return_to_handler)
int3
.Ldo_rop:
mov %rdi, (%rsp)
- RET
+ ALTERNATIVE __stringify(RET), \
+ __stringify(ANNOTATE_UNRET_SAFE; ret; int3), \
+ X86_FEATURE_CALL_DEPTH
SYM_CODE_END(return_to_handler)
#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index ec6fefbfd3c0..10c27b4261eb 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,7 +29,7 @@ static void __init i386_default_early_setup(void)
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
}
-asmlinkage __visible void __init i386_start_kernel(void)
+asmlinkage __visible void __init __noreturn i386_start_kernel(void)
{
/* Make sure IDT is set up before any exception happens */
idt_setup_early_handler();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6a3cfaf6b72a..49f7629b17f7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -203,7 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
/* Is the address not 2M aligned? */
- if (load_delta & ~PMD_PAGE_MASK)
+ if (load_delta & ~PMD_MASK)
for (;;);
/* Include the SME encryption mask in the fixup value */
@@ -471,7 +471,7 @@ static void __init copy_bootdata(char *real_mode_data)
sme_unmap_bootdata(real_mode_data);
}
-asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode_data)
{
/*
* Build-time sanity checks on the kernel image and module
@@ -537,7 +537,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
x86_64_start_reservations(real_mode_data);
}
-void __init x86_64_start_reservations(char *real_mode_data)
+void __init __noreturn x86_64_start_reservations(char *real_mode_data)
{
/* version is always not zero if it is copied */
if (!boot_params.hdr.version)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 9b7acc9c7874..67c8ed99144b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -261,16 +261,6 @@ SYM_FUNC_START(startup_32_smp)
addl $__PAGE_OFFSET, %esp
/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- movl setup_once_ref,%eax
- andl %eax,%eax
- jz 1f # Did we do this already?
- call *%eax
-1:
-
-/*
* Check if it is 486
*/
movb $4,X86 # at least 486
@@ -331,18 +321,7 @@ SYM_FUNC_END(startup_32_smp)
#include "verify_cpu.S"
-/*
- * setup_once
- *
- * The setup work we only want to run on the BSP.
- *
- * Warning: %esi is live across this function.
- */
__INIT
-setup_once:
- andl $0,setup_once_ref /* Once is enough, thanks */
- RET
-
SYM_FUNC_START(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
@@ -458,7 +437,6 @@ SYM_DATA(early_recursion_flag, .long 0)
__REFDATA
.align 4
SYM_DATA(initial_code, .long i386_start_kernel)
-SYM_DATA(setup_once_ref, .long setup_once)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d860d437631b..113c13376e51 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -42,7 +42,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -61,23 +61,15 @@ SYM_CODE_START_NOALIGN(startup_64)
* tables and then reload them.
*/
- /* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
+ /* Set up the stack for verify_cpu() */
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
- /*
- * initial_gs points to initial fixed_percpu_data struct with storage for
- * the stack protector canary. Global pointer fixups are needed at this
- * stage, so apply them as is done in fixup_pointer(), and initialize %gs
- * such that the canary can be accessed at %gs:40 for subsequent C calls.
- */
+ /* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
- movq initial_gs(%rip), %rax
- movq $_text, %rdx
- subq %rdx, %rax
- addq %rdi, %rax
- movq %rax, %rdx
+ leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
+ movl %edx, %eax
shrq $32, %rdx
wrmsr
@@ -85,6 +77,15 @@ SYM_CODE_START_NOALIGN(startup_64)
call startup_64_setup_env
popq %rsi
+ /* Now switch to __KERNEL_CS so IRET works reliably */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
+
+.Lon_kernel_cs:
+ UNWIND_HINT_END_OF_STACK
+
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* Activate SEV/SME memory encryption if supported/enabled. This needs to
@@ -98,15 +99,6 @@ SYM_CODE_START_NOALIGN(startup_64)
popq %rsi
#endif
- /* Now switch to __KERNEL_CS so IRET works reliably */
- pushq $__KERNEL_CS
- leaq .Lon_kernel_cs(%rip), %rax
- pushq %rax
- lretq
-
-.Lon_kernel_cs:
- UNWIND_HINT_EMPTY
-
/* Sanitize CPU configuration */
call verify_cpu
@@ -127,7 +119,7 @@ SYM_CODE_START_NOALIGN(startup_64)
SYM_CODE_END(startup_64)
SYM_CODE_START(secondary_startup_64)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -156,7 +148,7 @@ SYM_CODE_START(secondary_startup_64)
* verify_cpu() above to make sure NX is enabled.
*/
SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
@@ -238,16 +230,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
1:
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // above
+#ifdef CONFIG_SMP
+ movl smpboot_control(%rip), %ecx
+
+ /* Get the per cpu offset for the given CPU# which is in ECX */
+ movq __per_cpu_offset(,%rcx,8), %rdx
+#else
+ xorl %edx, %edx /* zero-extended to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack.
+ *
+ * RDX contains the per-cpu offset
+ */
+ movq pcpu_hot + X86_current_task(%rdx), %rax
+ movq TASK_threadsp(%rax), %rsp
+
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt early_gdt_descr(%rip)
+ subq $16, %rsp
+ movw $(GDT_SIZE-1), (%rsp)
+ leaq gdt_page(%rdx), %rax
+ movq %rax, 2(%rsp)
+ lgdt (%rsp)
+ addq $16, %rsp
/* set up data segments */
xorl %eax,%eax
@@ -271,16 +286,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movl initial_gs(%rip),%eax
- movl initial_gs+4(%rip),%edx
+#ifndef CONFIG_SMP
+ leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
+#endif
+ movl %edx, %eax
+ shrq $32, %rdx
wrmsr
- /*
- * Setup a boot time stack - Any secondary CPU will have lost its stack
- * by now because the cr3-switch above unmaps the real-mode stack
- */
- movq initial_stack(%rip), %rsp
-
/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
@@ -370,8 +382,13 @@ SYM_CODE_END(secondary_startup_64)
* start_secondary() via .Ljump_to_C_code.
*/
SYM_CODE_START(start_cpu0)
- UNWIND_HINT_EMPTY
- movq initial_stack(%rip), %rsp
+ ANNOTATE_NOENDBR
+ UNWIND_HINT_END_OF_STACK
+
+ /* Find the idle task stack */
+ movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq TASK_threadsp(%rcx), %rsp
+
jmp .Ljump_to_C_code
SYM_CODE_END(start_cpu0)
#endif
@@ -389,8 +406,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
UNWIND_HINT_IRET_REGS offset=8
ENDBR
- ANNOTATE_UNRET_END
-
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
@@ -415,16 +430,9 @@ SYM_CODE_END(vc_boot_ghcb)
__REFDATA
.balign 8
SYM_DATA(initial_code, .quad x86_64_start_kernel)
-SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA
__INIT
@@ -450,7 +458,6 @@ SYM_CODE_END(early_idt_handler_array)
SYM_CODE_START_LOCAL(early_idt_handler_common)
UNWIND_HINT_IRET_REGS offset=16
- ANNOTATE_UNRET_END
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -500,8 +507,6 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
UNWIND_HINT_IRET_REGS offset=8
ENDBR
- ANNOTATE_UNRET_END
-
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
@@ -656,8 +661,7 @@ SYM_DATA_END(level1_fixmap_pgt)
.data
.align 16
-SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
-SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))
+SYM_DATA(smpboot_control, .long 0)
.align 16
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 71f336425e58..c8eb1ac5125a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 668a4a6533d9..b01644c949b2 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -127,7 +127,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
set_debugreg(*dr7, 7);
if (info->mask)
- set_dr_addr_mask(info->mask, i);
+ amd_set_dr_addr_mask(info->mask, i);
return 0;
}
@@ -166,7 +166,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
set_debugreg(dr7, 7);
if (info->mask)
- set_dr_addr_mask(0, i);
+ amd_set_dr_addr_mask(0, i);
/*
* Ensure the write to cpu_dr7 is after we've set the DR7 register.
@@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
/* CPU entry erea is always used for CPU entry */
if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
- CPU_ENTRY_AREA_TOTAL_SIZE))
+ CPU_ENTRY_AREA_MAP_SIZE))
return true;
/*
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 15aefa3f3e18..4d8aff05a509 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -114,6 +114,7 @@ static void make_8259A_irq(unsigned int irq)
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
+ irq_set_status_flags(irq, IRQ_LEVEL);
enable_irq(irq);
lapic_assign_legacy_vector(irq, true);
}
@@ -407,7 +408,7 @@ struct legacy_pic null_legacy_pic = {
.make_irq = legacy_pic_uint_noop,
};
-struct legacy_pic default_legacy_pic = {
+static struct legacy_pic default_legacy_pic = {
.nr_legacy_irqs = NR_IRQS_LEGACY,
.chip = &i8259A_chip,
.mask = mask_8259A_irq,
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 01833ebf5e8e..dc1049c01f9b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -52,9 +52,6 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
-DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
-
static void call_on_stack(void *func, void *stack)
{
asm volatile("xchgl %%ebx,%%esp \n"
@@ -77,7 +74,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
u32 *isp, *prev_esp, arg1;
curstk = (struct irq_stack *) current_stack();
- irqstk = __this_cpu_read(hardirq_stack_ptr);
+ irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -115,7 +112,7 @@ int irq_init_percpu_irqstack(unsigned int cpu)
int node = cpu_to_node(cpu);
struct page *ph, *ps;
- if (per_cpu(hardirq_stack_ptr, cpu))
+ if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
return 0;
ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
@@ -127,8 +124,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return -ENOMEM;
}
- per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
- per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph);
+ per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps);
return 0;
}
@@ -138,7 +135,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk;
u32 *isp, *prev_esp;
- irqstk = __this_cpu_read(softirq_stack_ptr);
+ irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr);
/* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1c0fb96b9e39..fe0c859873d1 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -50,7 +50,7 @@ static int map_irq_stack(unsigned int cpu)
return -ENOMEM;
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -63,14 +63,14 @@ static int map_irq_stack(unsigned int cpu)
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
/* Store actual TOS to avoid adjustment in the hotpath */
- per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
+ per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
int irq_init_percpu_irqstack(unsigned int cpu)
{
- if (per_cpu(hardirq_stack_ptr, cpu))
+ if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu))
return 0;
return map_irq_stack(cpu);
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index beb1bada1b0a..c683666876f1 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -65,8 +65,10 @@ void __init init_ISA_irqs(void)
legacy_pic->init(0);
- for (i = 0; i < nr_legacy_irqs(); i++)
+ for (i = 0; i < nr_legacy_irqs(); i++) {
irq_set_chip_and_handler(i, chip, handle_level_irq);
+ irq_set_status_flags(i, IRQ_LEVEL);
+ }
}
void __init init_IRQ(void)
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511..670eb08b972a 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -77,15 +77,6 @@ static struct ctl_table itmt_kern_table[] = {
{}
};
-static struct ctl_table itmt_root_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = itmt_kern_table,
- },
- {}
-};
-
static struct ctl_table_header *itmt_sysctl_header;
/**
@@ -114,7 +105,7 @@ int sched_set_itmt_support(void)
return 0;
}
- itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table);
if (!itmt_sysctl_header) {
mutex_unlock(&itmt_update_mutex);
return -ENOMEM;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 6b58610a1552..a61c12c01270 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -476,7 +476,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
efi_map_offset = params_cmdline_sz;
efi_setup_data_offset = efi_map_offset + ALIGN(efi_map_sz, 16);
- /* Copy setup header onto bootparams. Documentation/x86/boot.rst */
+ /* Copy setup header onto bootparams. Documentation/arch/x86/boot.rst */
setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
/* Is there a limit on setup header size? */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index eb8bc82846b9..f7f6042eb7e6 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -37,12 +37,14 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
+#include <linux/set_memory.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -51,7 +53,6 @@
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
-#include <asm/set_memory.h>
#include <asm/ibt.h>
#include "common.h"
@@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr)
if (ret < 0)
return 0;
+#ifdef CONFIG_KGDB
/*
- * Another debugging subsystem might insert this breakpoint.
- * In that case, we can't recover it.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
return 0;
+#endif
addr += insn.length;
}
@@ -414,18 +418,11 @@ void *alloc_insn_page(void)
if (!page)
return NULL;
- set_vm_flush_reset_perms(page);
- /*
- * First make the page read-only, and only then make it executable to
- * prevent it from being W+X in between.
- */
- set_memory_ro((unsigned long)page, 1);
-
/*
* TODO: Once additional kernel code protection mechanisms are set, ensure
* that the page was not maliciously altered and it is still zeroed.
*/
- set_memory_x((unsigned long)page, 1);
+ set_memory_rox((unsigned long)page, 1);
return page;
}
@@ -467,50 +464,26 @@ static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(kprobe_emulate_call);
-static nokprobe_inline
-void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
{
unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
- if (cond)
- ip += p->ainsn.rel32;
+ ip += p->ainsn.rel32;
int3_emulate_jmp(regs, ip);
}
-
-static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
-{
- __kprobe_emulate_jmp(p, regs, true);
-}
NOKPROBE_SYMBOL(kprobe_emulate_jmp);
-static const unsigned long jcc_mask[6] = {
- [0] = X86_EFLAGS_OF,
- [1] = X86_EFLAGS_CF,
- [2] = X86_EFLAGS_ZF,
- [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
- [4] = X86_EFLAGS_SF,
- [5] = X86_EFLAGS_PF,
-};
-
static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
{
- bool invert = p->ainsn.jcc.type & 1;
- bool match;
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
- if (p->ainsn.jcc.type < 0xc) {
- match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
- } else {
- match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
- ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
- if (p->ainsn.jcc.type >= 0xe)
- match = match || (regs->flags & X86_EFLAGS_ZF);
- }
- __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+ int3_emulate_jcc(regs, p->ainsn.jcc.type, ip, p->ainsn.rel32);
}
NOKPROBE_SYMBOL(kprobe_emulate_jcc);
static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
bool match;
if (p->ainsn.loop.type != 3) { /* LOOP* */
@@ -538,7 +511,9 @@ static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
else if (p->ainsn.loop.type == 1) /* LOOPE */
match = match && (regs->flags & X86_EFLAGS_ZF);
- __kprobe_emulate_jmp(p, regs, match);
+ if (match)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
}
NOKPROBE_SYMBOL(kprobe_emulate_loop);
@@ -628,7 +603,7 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
/* 1 byte conditional jump */
p->ainsn.emulate_op = kprobe_emulate_jcc;
p->ainsn.jcc.type = opcode & 0xf;
- p->ainsn.rel32 = *(char *)insn->immediate.bytes;
+ p->ainsn.rel32 = insn->immediate.value;
break;
case 0x0f:
opcode = insn->opcode.bytes[1];
@@ -662,17 +637,19 @@ static int prepare_emulation(struct kprobe *p, struct insn *insn)
* is determined by the MOD/RM byte.
*/
opcode = insn->modrm.bytes[0];
- if ((opcode & 0x30) == 0x10) {
- if ((opcode & 0x8) == 0x8)
- return -EOPNOTSUPP; /* far call */
- /* call absolute, indirect */
+ switch (X86_MODRM_REG(opcode)) {
+ case 0b010: /* FF /2, call near, absolute indirect */
p->ainsn.emulate_op = kprobe_emulate_call_indirect;
- } else if ((opcode & 0x30) == 0x20) {
- if ((opcode & 0x8) == 0x8)
- return -EOPNOTSUPP; /* far jmp */
- /* jmp near absolute indirect */
+ break;
+ case 0b100: /* FF /4, jmp near, absolute indirect */
p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
- } else
+ break;
+ case 0b011: /* FF /3, call far, absolute indirect */
+ case 0b101: /* FF /5, jmp far, absolute indirect */
+ return -EOPNOTSUPP;
+ }
+
+ if (!p->ainsn.emulate_op)
break;
if (insn->addr_bytes != sizeof(unsigned long))
@@ -993,20 +970,6 @@ int kprobe_int3_handler(struct pt_regs *regs)
kprobe_post_process(p, regs, kcb);
return 1;
}
- }
-
- if (*addr != INT3_INSN_OPCODE) {
- /*
- * The breakpoint instruction was removed right
- * after we hit it. Another cpu has removed
- * either a probepoint or a debugger breakpoint
- * at this address. In either case, no further
- * handling of this interrupt is appropriate.
- * Back up over the (now missing) int3 and run
- * the original instruction.
- */
- regs->ip = (unsigned long)addr;
- return 1;
} /* else: not a kprobe fault; let the kernel handle it */
return 0;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index e6b8c5362b94..57b0037d0a99 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -15,6 +15,7 @@
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
+#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
@@ -45,8 +46,8 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
op = container_of(kp, struct optimized_kprobe, kp);
- /* If op->list is not empty, op is under optimizing */
- if (list_empty(&op->list))
+ /* If op is optimized or under unoptimizing */
+ if (list_empty(&op->list) || optprobe_queued_unopt(op))
goto found;
}
}
@@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn)
return ret;
}
-static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
-{
- unsigned char ops;
-
- for (; addr < eaddr; addr++) {
- if (get_kernel_nofault(ops, (void *)addr) < 0 ||
- ops != INT3_INSN_OPCODE)
- return false;
- }
-
- return true;
-}
-
/* Decode whole function to ensure any instructions don't jump into target */
static int can_optimize(unsigned long paddr)
{
@@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr)
ret = insn_decode_kernel(&insn, (void *)recovered_insn);
if (ret < 0)
return 0;
-
+#ifdef CONFIG_KGDB
/*
- * In the case of detecting unknown breakpoint, this could be
- * a padding INT3 between functions. Let's check that all the
- * rest of the bytes are also INT3.
+ * If there is a dynamically installed kgdb sw breakpoint,
+ * this function should not be probed.
*/
- if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
- return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
-
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
+ kgdb_has_hit_break(addr))
+ return 0;
+#endif
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
@@ -365,7 +353,7 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op)
for (i = 1; i < op->optinsn.size; i++) {
p = get_kprobe(op->kp.addr + i);
- if (p && !kprobe_disabled(p))
+ if (p && !kprobe_disarmed(p))
return -EEXIST;
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d4e48b4a438b..1cceac5984da 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -349,7 +349,7 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
- u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
+ u64 pa;
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
@@ -798,19 +798,13 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
* Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
* restoring to/from the stack.
*/
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
+#define PV_VCPU_PREEMPTED_ASM \
+ "movq __per_cpu_offset(,%rdi,8), %rax\n\t" \
+ "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
+ "setne %al\n\t"
+DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 16333ba1904b..0f35d44c56fe 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,12 +71,12 @@ static int kvm_set_wallclock(const struct timespec64 *now)
return -ENODEV;
}
-static u64 kvm_clock_read(void)
+static noinstr u64 kvm_clock_read(void)
{
u64 ret;
preempt_disable_notrace();
- ret = pvclock_clocksource_read(this_cpu_pvti());
+ ret = pvclock_clocksource_read_nowd(this_cpu_pvti());
preempt_enable_notrace();
return ret;
}
@@ -86,7 +86,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
return kvm_clock_read();
}
-static u64 kvm_sched_clock_read(void)
+static noinstr u64 kvm_sched_clock_read(void)
{
return kvm_clock_read() - kvm_sched_clock_offset;
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 0611fd83858e..1a3e2c05a8a5 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -374,17 +374,6 @@ void machine_kexec(struct kimage *image)
/* arch-dependent functionality related to kexec file-based syscall */
#ifdef CONFIG_KEXEC_FILE
-void *arch_kexec_kernel_image_load(struct kimage *image)
-{
- if (!image->fops || !image->fops->load)
- return ERR_PTR(-ENOEXEC);
-
- return image->fops->load(image, image->kernel_buf,
- image->kernel_buf_len, image->initrd_buf,
- image->initrd_buf_len, image->cmdline_buf,
- image->cmdline_buf_len);
-}
-
/*
* Apply purgatory relocations.
*
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index c032edcd3d95..b05f62ee2344 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -53,7 +53,7 @@ static unsigned long int get_module_load_offset(void)
*/
if (module_load_offset == 0)
module_load_offset =
- (prandom_u32_max(1024) + 1) * PAGE_SIZE;
+ get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
mutex_unlock(&module_kaslr_mutex);
}
return module_load_offset;
@@ -74,10 +74,11 @@ void *module_alloc(unsigned long size)
return NULL;
p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask,
- PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE,
- __builtin_return_address(0));
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, gfp_mask, PAGE_KERNEL,
+ VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
+ NUMA_NO_NODE, __builtin_return_address(0));
+
if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
vfree(p);
return NULL;
@@ -128,22 +129,27 @@ int apply_relocate(Elf32_Shdr *sechdrs,
return 0;
}
#else /*X86_64*/
-static int __apply_relocate_add(Elf64_Shdr *sechdrs,
+static int __write_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me,
- void *(*write)(void *dest, const void *src, size_t len))
+ void *(*write)(void *dest, const void *src, size_t len),
+ bool apply)
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
Elf64_Sym *sym;
void *loc;
u64 val;
+ u64 zero = 0ULL;
- DEBUGP("Applying relocate section %u to %u\n",
+ DEBUGP("%s relocate section %u to %u\n",
+ apply ? "Applying" : "Clearing",
relsec, sechdrs[relsec].sh_info);
for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+ size_t size;
+
/* This is where to make the change */
loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
+ rel[i].r_offset;
@@ -161,56 +167,53 @@ static int __apply_relocate_add(Elf64_Shdr *sechdrs,
switch (ELF64_R_TYPE(rel[i].r_info)) {
case R_X86_64_NONE:
- break;
+ continue; /* nothing to write */
case R_X86_64_64:
- if (*(u64 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 8);
+ size = 8;
break;
case R_X86_64_32:
- if (*(u32 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 4);
- if (val != *(u32 *)loc)
+ if (val != *(u32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_32S:
- if (*(s32 *)loc != 0)
- goto invalid_relocation;
- write(loc, &val, 4);
- if ((s64)val != *(s32 *)loc)
+ if ((s64)val != *(s32 *)&val)
goto overflow;
+ size = 4;
break;
case R_X86_64_PC32:
case R_X86_64_PLT32:
- if (*(u32 *)loc != 0)
- goto invalid_relocation;
val -= (u64)loc;
- write(loc, &val, 4);
-#if 0
- if ((s64)val != *(s32 *)loc)
- goto overflow;
-#endif
+ size = 4;
break;
case R_X86_64_PC64:
- if (*(u64 *)loc != 0)
- goto invalid_relocation;
val -= (u64)loc;
- write(loc, &val, 8);
+ size = 8;
break;
default:
pr_err("%s: Unknown rela relocation: %llu\n",
me->name, ELF64_R_TYPE(rel[i].r_info));
return -ENOEXEC;
}
+
+ if (apply) {
+ if (memcmp(loc, &zero, size)) {
+ pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &val, size);
+ } else {
+ if (memcmp(loc, &val, size)) {
+ pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
+ return -ENOEXEC;
+ }
+ write(loc, &zero, size);
+ }
}
return 0;
-invalid_relocation:
- pr_err("x86/modules: Skipping invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
- (int)ELF64_R_TYPE(rel[i].r_info), loc, val);
- return -ENOEXEC;
-
overflow:
pr_err("overflow in relocation type %d val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), val);
@@ -219,11 +222,12 @@ overflow:
return -ENOEXEC;
}
-int apply_relocate_add(Elf64_Shdr *sechdrs,
- const char *strtab,
- unsigned int symindex,
- unsigned int relsec,
- struct module *me)
+static int write_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me,
+ bool apply)
{
int ret;
bool early = me->state == MODULE_STATE_UNFORMED;
@@ -234,8 +238,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
mutex_lock(&text_mutex);
}
- ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
- write);
+ ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me,
+ write, apply);
if (!early) {
text_poke_sync();
@@ -245,20 +249,39 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
return ret;
}
+int apply_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ return write_relocate_add(sechdrs, strtab, symindex, relsec, me, true);
+}
+
+#ifdef CONFIG_LIVEPATCH
+void clear_relocate_add(Elf64_Shdr *sechdrs,
+ const char *strtab,
+ unsigned int symindex,
+ unsigned int relsec,
+ struct module *me)
+{
+ write_relocate_add(sechdrs, strtab, symindex, relsec, me, false);
+}
+#endif
+
#endif
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ const Elf_Shdr *s, *alt = NULL, *locks = NULL,
*para = NULL, *orc = NULL, *orc_ip = NULL,
- *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL;
+ *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
+ *calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- if (!strcmp(".text", secstrings + s->sh_name))
- text = s;
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
@@ -273,6 +296,10 @@ int module_finalize(const Elf_Ehdr *hdr,
retpolines = s;
if (!strcmp(".return_sites", secstrings + s->sh_name))
returns = s;
+ if (!strcmp(".call_sites", secstrings + s->sh_name))
+ calls = s;
+ if (!strcmp(".cfi_sites", secstrings + s->sh_name))
+ cfi = s;
if (!strcmp(".ibt_endbr_seal", secstrings + s->sh_name))
ibt_endbr = s;
}
@@ -285,6 +312,22 @@ int module_finalize(const Elf_Ehdr *hdr,
void *pseg = (void *)para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
+ if (retpolines || cfi) {
+ void *rseg = NULL, *cseg = NULL;
+ unsigned int rsize = 0, csize = 0;
+
+ if (retpolines) {
+ rseg = (void *)retpolines->sh_addr;
+ rsize = retpolines->sh_size;
+ }
+
+ if (cfi) {
+ cseg = (void *)cfi->sh_addr;
+ csize = cfi->sh_size;
+ }
+
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ }
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
apply_retpolines(rseg, rseg + retpolines->sh_size);
@@ -298,16 +341,32 @@ int module_finalize(const Elf_Ehdr *hdr,
void *aseg = (void *)alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
+ if (calls || para) {
+ struct callthunk_sites cs = {};
+
+ if (calls) {
+ cs.call_start = (void *)calls->sh_addr;
+ cs.call_end = (void *)calls->sh_addr + calls->sh_size;
+ }
+
+ if (para) {
+ cs.pv_start = (void *)para->sh_addr;
+ cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ }
+
+ callthunks_patch_module_calls(&cs, me);
+ }
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size);
}
- if (locks && text) {
+ if (locks) {
void *lseg = (void *)locks->sh_addr;
- void *tseg = (void *)text->sh_addr;
+ void *text = me->mem[MOD_TEXT].base;
+ void *text_end = text + me->mem[MOD_TEXT].size;
alternatives_smp_module_add(me, me->name,
lseg, lseg + locks->sh_size,
- tseg, tseg + text->sh_size);
+ text, text_end);
}
if (orc && orc_ip)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index ed8ac6bcbafb..7bb17d37db01 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -250,7 +250,7 @@ static int msr_device_destroy(unsigned int cpu)
return 0;
}
-static char *msr_devnode(struct device *dev, umode_t *mode)
+static char *msr_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
@@ -263,7 +263,7 @@ static int __init msr_init(void)
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
return -EBUSY;
}
- msr_class = class_create(THIS_MODULE, "msr");
+ msr_class = class_create("msr");
if (IS_ERR(msr_class)) {
err = PTR_ERR(msr_class);
goto out_chrdev;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cec0bfa3bc04..776f4b1e395b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -69,6 +69,15 @@ struct nmi_stats {
unsigned int unknown;
unsigned int external;
unsigned int swallow;
+ unsigned long recv_jiffies;
+ unsigned long idt_seq;
+ unsigned long idt_nmi_seq;
+ unsigned long idt_ignored;
+ atomic_long_t idt_calls;
+ unsigned long idt_seq_snap;
+ unsigned long idt_nmi_seq_snap;
+ unsigned long idt_ignored_snap;
+ long idt_calls_snap;
};
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
irqentry_state_t irq_state;
+ struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
* cause nested NMIs, but those can be handled safely.
*/
sev_es_nmi_complete();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+ arch_atomic_long_inc(&nsp->idt_calls);
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
}
this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2());
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
nmi_restart:
/*
@@ -509,8 +526,19 @@ nmi_restart:
inc_irq_stat(__nmi_count);
- if (!ignore_nmis)
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+ WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+ } else if (!ignore_nmis) {
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+ }
default_do_nmi(regs);
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+ WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+ }
+ }
irqentry_nmi_exit(regs, irq_state);
@@ -525,16 +553,94 @@ nmi_restart:
if (user_mode(regs))
mds_user_clear_cpu_buffers();
+ if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+ WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+ WARN_ON_ONCE(nsp->idt_seq & 0x1);
+ WRITE_ONCE(nsp->recv_jiffies, jiffies);
+ }
}
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
-#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
+#endif
+#endif
+
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/* */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */
+/* | +------ cpu_is_offline(cpu) */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
+/* | | | NMI handler has been invoked. */
+/* | | | */
+/* V V V */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+ int cpu;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+ nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+ nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+ nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+ }
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+ int cpu;
+ int idx;
+ unsigned long nmi_seq;
+ unsigned long j = jiffies;
+ char *modp;
+ char *msgp;
+ char *msghp;
+ struct nmi_stats *nsp;
+
+ for_each_cpu(cpu, btp) {
+ nsp = per_cpu_ptr(&nmi_stats, cpu);
+ modp = "";
+ msghp = "";
+ nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+ msgp = "CPU entered NMI handler function, but has not exited";
+ } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+ msgp = "CPU is handling NMIs";
+ } else {
+ idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+ (cpu_is_offline(cpu) << 1) |
+ (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+ msgp = nmi_check_stall_msg[idx];
+ if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+ modp = ", but OK because ignore_nmis was set";
+ if (nmi_seq & ~0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ msghp = " (CPU exited one NMI handler function)";
+ }
+ pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+ __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+ }
+}
+
#endif
void stop_nmi(void)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 7ca2d46c08cc..ac10b46c5832 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -32,32 +32,16 @@
#include <asm/special_insns.h>
#include <asm/tlb.h>
#include <asm/io_bitmap.h>
+#include <asm/gsseg.h>
/*
* nop stub, which must not clobber anything *including the stack* to
* avoid confusing the entry prologues.
*/
-extern void _paravirt_nop(void);
-asm (".pushsection .entry.text, \"ax\"\n"
- ".global _paravirt_nop\n"
- "_paravirt_nop:\n\t"
- ASM_ENDBR
- ASM_RET
- ".size _paravirt_nop, . - _paravirt_nop\n\t"
- ".type _paravirt_nop, @function\n\t"
- ".popsection");
+DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
/* stub always returning 0. */
-asm (".pushsection .entry.text, \"ax\"\n"
- ".global paravirt_ret0\n"
- "paravirt_ret0:\n\t"
- ASM_ENDBR
- "xor %" _ASM_AX ", %" _ASM_AX ";\n\t"
- ASM_RET
- ".size paravirt_ret0, . - paravirt_ret0\n\t"
- ".type paravirt_ret0, @function\n\t"
- ".popsection");
-
+DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -80,11 +64,11 @@ static unsigned paravirt_patch_call(void *insn_buff, const void *target,
}
#ifdef CONFIG_PARAVIRT_XXL
-/* identity function, which can be inlined */
-u64 notrace _paravirt_ident_64(u64 x)
-{
- return x;
-}
+DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -213,11 +197,6 @@ void paravirt_end_context_switch(struct task_struct *next)
arch_enter_lazy_mmu_mode();
}
-static noinstr unsigned long pv_native_read_cr2(void)
-{
- return native_read_cr2();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -233,14 +212,14 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
native_set_debugreg(regno, val);
}
-static noinstr void pv_native_irq_enable(void)
+noinstr void pv_native_wbinvd(void)
{
- native_irq_enable();
+ native_wbinvd();
}
-static noinstr void pv_native_irq_disable(void)
+static noinstr void pv_native_safe_halt(void)
{
- native_irq_disable();
+ native_safe_halt();
}
#endif
@@ -273,7 +252,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
- .cpu.wbinvd = native_wbinvd,
+ .cpu.wbinvd = pv_native_wbinvd,
.cpu.read_msr = native_read_msr,
.cpu.write_msr = native_write_msr,
.cpu.read_msr_safe = native_read_msr_safe,
@@ -304,10 +283,10 @@ struct paravirt_patch_template pv_ops = {
.cpu.end_context_switch = paravirt_nop,
/* Irq ops. */
- .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
- .irq.safe_halt = native_safe_halt,
+ .irq.safe_halt = pv_native_safe_halt,
.irq.halt = native_halt,
#endif /* CONFIG_PARAVIRT_XXL */
@@ -369,8 +348,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.make_pte = PTE_IDENT,
.mmu.make_pgd = PTE_IDENT,
- .mmu.dup_mmap = paravirt_nop,
- .mmu.activate_mm = paravirt_nop,
+ .mmu.enter_mmap = paravirt_nop,
.mmu.lazy_mode = {
.enter = paravirt_nop,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 30bbe4abb5d6..de6be0a3965e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -124,7 +124,7 @@ void __init pci_iommu_alloc(void)
}
/*
- * See <Documentation/x86/x86_64/boot-options.rst> for the iommu kernel
+ * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel
* parameter documentation.
*/
static __init int iommu_setup(char *p)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c21b7347a26d..dac41a0072ea 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -24,6 +25,7 @@
#include <linux/cpuidle.h>
#include <linux/acpi.h>
#include <linux/elf-randomize.h>
+#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
@@ -47,6 +49,7 @@
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>
+#include <asm/mmu_context.h>
#include "process.h"
@@ -161,6 +164,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+
+ if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
+ set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
#else
p->thread.sp0 = (unsigned long) (childregs + 1);
savesegment(gs, p->thread.gs);
@@ -367,6 +373,8 @@ void arch_setup_new_exec(void)
task_clear_spec_ssb_noexec(current);
speculation_ctrl_update(read_thread_flags());
}
+
+ mm_reset_untag_mask(current->mm);
}
#ifdef CONFIG_X86_IOPL_IOPERM
@@ -600,7 +608,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
}
if (updmsr)
- write_spec_ctrl_current(msr, false);
+ update_spec_ctrl_cond(msr);
}
static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
@@ -694,10 +702,27 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
-static void (*x86_idle)(void);
+/*
+ * We use this if we don't have any better idle routine..
+ */
+void __cpuidle default_idle(void)
+{
+ raw_safe_halt();
+ raw_local_irq_disable();
+}
+#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
+EXPORT_SYMBOL(default_idle);
+#endif
+
+DEFINE_STATIC_CALL_NULL(x86_idle, default_idle);
+
+static bool x86_idle_set(void)
+{
+ return !!static_call_query(x86_idle);
+}
#ifndef CONFIG_SMP
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
BUG();
}
@@ -709,7 +734,7 @@ void arch_cpu_idle_enter(void)
local_touch_nmi();
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
@@ -717,28 +742,18 @@ void arch_cpu_idle_dead(void)
/*
* Called from the generic idle code.
*/
-void arch_cpu_idle(void)
+void __cpuidle arch_cpu_idle(void)
{
- x86_idle();
+ static_call(x86_idle)();
}
-
-/*
- * We use this if we don't have any better idle routine..
- */
-void __cpuidle default_idle(void)
-{
- raw_safe_halt();
-}
-#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
-EXPORT_SYMBOL(default_idle);
-#endif
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
- bool ret = !!x86_idle;
+ bool ret = x86_idle_set();
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
return ret;
}
@@ -800,13 +815,7 @@ static void amd_e400_idle(void)
default_idle();
- /*
- * The switch back from broadcast mode needs to be called with
- * interrupts disabled.
- */
- raw_local_irq_disable();
tick_broadcast_exit();
- raw_local_irq_enable();
}
/*
@@ -864,12 +873,10 @@ static __cpuidle void mwait_idle(void)
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
- if (!need_resched())
+ if (!need_resched()) {
__sti_mwait(0, 0);
- else
- raw_local_irq_enable();
- } else {
- raw_local_irq_enable();
+ raw_local_irq_disable();
+ }
}
__current_clr_polling();
}
@@ -880,20 +887,20 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
- if (x86_idle || boot_option_idle_override == IDLE_POLL)
+ if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
return;
if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
pr_info("using AMD E400 aware idle routine\n");
- x86_idle = amd_e400_idle;
+ static_call_update(x86_idle, amd_e400_idle);
} else if (prefer_mwait_c1_over_halt(c)) {
pr_info("using mwait in idle threads\n");
- x86_idle = mwait_idle;
+ static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- x86_idle = tdx_safe_halt;
+ static_call_update(x86_idle, tdx_safe_halt);
} else
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
}
void amd_e400_c1e_apic_setup(void)
@@ -946,7 +953,7 @@ static int __init idle_setup(char *str)
* To continue to load the CPU idle driver, don't touch
* the boot_option_idle_override.
*/
- x86_idle = default_idle;
+ static_call_update(x86_idle, default_idle);
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
@@ -965,7 +972,7 @@ early_param("idle", idle_setup);
unsigned long arch_align_stack(unsigned long sp)
{
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= prandom_u32_max(8192);
+ sp -= get_random_u32_below(8192);
return sp & ~0xf;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2f314b170c9f..708c87b88cc1 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -191,13 +191,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
arch_end_context_switch(next_p);
/*
- * Reload esp0 and cpu_current_top_of_stack. This changes
+ * Reload esp0 and pcpu_hot.top_of_stack. This changes
* current_thread_info(). Refresh the SYSENTER configuration in
* case prev or next is vm86.
*/
update_task_stack(next_p);
refresh_sysenter_cs(next);
- this_cpu_write(cpu_current_top_of_stack,
+ this_cpu_write(pcpu_hot.top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
@@ -207,12 +207,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
loadsegment(gs, next->gs);
- this_cpu_write(current_task, next_p);
+ raw_cpu_write(pcpu_hot.current_task, next_p);
switch_fpu_finish();
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in();
+ resctrl_sched_in(next_p);
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6b3418bff326..3d181c16a2f6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
+#include <linux/iommu.h>
#include <asm/processor.h>
#include <asm/pkru.h>
@@ -165,7 +166,7 @@ static noinstr unsigned long __rdgsbase_inactive(void)
lockdep_assert_irqs_disabled();
- if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
gsbase = rdgsbase();
native_swapgs();
@@ -190,7 +191,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
lockdep_assert_irqs_disabled();
- if (!static_cpu_has(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
wrgsbase(gsbase);
native_swapgs();
@@ -563,7 +564,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
- this_cpu_read(hardirq_stack_inuse));
+ this_cpu_read(pcpu_hot.hardirq_stack_inuse));
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_fpu, cpu);
@@ -617,8 +618,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- this_cpu_write(current_task, next_p);
- this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
+ raw_cpu_write(pcpu_hot.current_task, next_p);
+ raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
switch_fpu_finish();
@@ -656,7 +657,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
}
/* Load the Intel cache allocation PQR MSR. */
- resctrl_sched_in();
+ resctrl_sched_in(next_p);
return prev_p;
}
@@ -671,7 +672,7 @@ void set_personality_64bit(void)
task_pt_regs(current)->orig_ax = __NR_execve;
current_thread_info()->status &= ~TS_COMPAT;
if (current->mm)
- current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
+ __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
@@ -708,7 +709,7 @@ static void __set_personality_ia32(void)
* uprobes applied to this MM need to know this and
* cannot use user_64bit_mode() at that time.
*/
- current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
+ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
}
current->personality |= force_personality32;
@@ -743,6 +744,52 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
}
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+
+#define LAM_U57_BITS 6
+
+static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return -ENODEV;
+
+ /* PTRACE_ARCH_PRCTL */
+ if (current->mm != mm)
+ return -EINVAL;
+
+ if (mm_valid_pasid(mm) &&
+ !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
+ return -EINVAL;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
+ mmap_write_unlock(mm);
+ return -EBUSY;
+ }
+
+ if (!nr_bits) {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ } else if (nr_bits <= LAM_U57_BITS) {
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+ } else {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ }
+
+ write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
+ set_tlbstate_lam_mode(mm);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+
+ mmap_write_unlock(mm);
+
+ return 0;
+}
+#endif
+
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
@@ -830,7 +877,23 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
case ARCH_MAP_VDSO_64:
return prctl_map_vdso(&vdso_image_64, arg2);
#endif
-
+#ifdef CONFIG_ADDRESS_MASKING
+ case ARCH_GET_UNTAG_MASK:
+ return put_user(task->mm->context.untag_mask,
+ (unsigned long __user *)arg2);
+ case ARCH_ENABLE_TAGGED_ADDR:
+ return prctl_enable_tagged_addr(task->mm, arg2);
+ case ARCH_FORCE_TAGGED_SVA:
+ if (current != task)
+ return -EINVAL;
+ set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
+ return 0;
+ case ARCH_GET_MAX_TAG_BITS:
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return put_user(0, (unsigned long __user *)arg2);
+ else
+ return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
+#endif
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 37c12fb92906..dfaa270a7cc9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -44,16 +44,35 @@
#include "tls.h"
-enum x86_regset {
- REGSET_GENERAL,
- REGSET_FP,
- REGSET_XFP,
- REGSET_IOPERM64 = REGSET_XFP,
- REGSET_XSTATE,
- REGSET_TLS,
- REGSET_IOPERM32,
+enum x86_regset_32 {
+ REGSET32_GENERAL,
+ REGSET32_FP,
+ REGSET32_XFP,
+ REGSET32_XSTATE,
+ REGSET32_TLS,
+ REGSET32_IOPERM,
};
+enum x86_regset_64 {
+ REGSET64_GENERAL,
+ REGSET64_FP,
+ REGSET64_IOPERM,
+ REGSET64_XSTATE,
+};
+
+#define REGSET_GENERAL \
+({ \
+ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \
+ REGSET32_GENERAL; \
+})
+
+#define REGSET_FP \
+({ \
+ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \
+ REGSET32_FP; \
+})
+
+
struct pt_regs_offset {
const char *name;
int offset;
@@ -788,13 +807,13 @@ long arch_ptrace(struct task_struct *child, long request,
#ifdef CONFIG_X86_32
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP,
+ REGSET32_XFP,
0, sizeof(struct user_fxsr_struct),
datap) ? -EIO : 0;
#endif
@@ -1086,13 +1105,13 @@ static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
+ REGSET32_XFP, 0,
sizeof(struct user32_fxsr_struct),
datap);
@@ -1215,29 +1234,38 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
static struct user_regset x86_64_regsets[] __ro_after_init = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .regset_get = genregs_get, .set = genregs_set
+ [REGSET64_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .regset_get = genregs_get,
+ .set = genregs_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct fxregs_state) / sizeof(long),
- .size = sizeof(long), .align = sizeof(long),
- .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
+ [REGSET64_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct fxregs_state) / sizeof(long),
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .regset_get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET64_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_IOPERM64] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_LONGS,
- .size = sizeof(long), .align = sizeof(long),
- .active = ioperm_active, .regset_get = ioperm_get
+ [REGSET64_IOPERM] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_LONGS,
+ .size = sizeof(long),
+ .align = sizeof(long),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1256,43 +1284,57 @@ static const struct user_regset_view user_x86_64_view = {
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static struct user_regset x86_32_regsets[] __ro_after_init = {
- [REGSET_GENERAL] = {
- .core_note_type = NT_PRSTATUS,
- .n = sizeof(struct user_regs_struct32) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .regset_get = genregs32_get, .set = genregs32_set
+ [REGSET32_GENERAL] = {
+ .core_note_type = NT_PRSTATUS,
+ .n = sizeof(struct user_regs_struct32) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = genregs32_get,
+ .set = genregs32_set
},
- [REGSET_FP] = {
- .core_note_type = NT_PRFPREG,
- .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set
+ [REGSET32_FP] = {
+ .core_note_type = NT_PRFPREG,
+ .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_fpregs_active,
+ .regset_get = fpregs_get,
+ .set = fpregs_set
},
- [REGSET_XFP] = {
- .core_note_type = NT_PRXFPREG,
- .n = sizeof(struct fxregs_state) / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set
+ [REGSET32_XFP] = {
+ .core_note_type = NT_PRXFPREG,
+ .n = sizeof(struct fxregs_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = regset_xregset_fpregs_active,
+ .regset_get = xfpregs_get,
+ .set = xfpregs_set
},
- [REGSET_XSTATE] = {
- .core_note_type = NT_X86_XSTATE,
- .size = sizeof(u64), .align = sizeof(u64),
- .active = xstateregs_active, .regset_get = xstateregs_get,
- .set = xstateregs_set
+ [REGSET32_XSTATE] = {
+ .core_note_type = NT_X86_XSTATE,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = xstateregs_active,
+ .regset_get = xstateregs_get,
+ .set = xstateregs_set
},
- [REGSET_TLS] = {
- .core_note_type = NT_386_TLS,
- .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
- .size = sizeof(struct user_desc),
- .align = sizeof(struct user_desc),
- .active = regset_tls_active,
- .regset_get = regset_tls_get, .set = regset_tls_set
+ [REGSET32_TLS] = {
+ .core_note_type = NT_386_TLS,
+ .n = GDT_ENTRY_TLS_ENTRIES,
+ .bias = GDT_ENTRY_TLS_MIN,
+ .size = sizeof(struct user_desc),
+ .align = sizeof(struct user_desc),
+ .active = regset_tls_active,
+ .regset_get = regset_tls_get,
+ .set = regset_tls_set
},
- [REGSET_IOPERM32] = {
- .core_note_type = NT_386_IOPERM,
- .n = IO_BITMAP_BYTES / sizeof(u32),
- .size = sizeof(u32), .align = sizeof(u32),
- .active = ioperm_active, .regset_get = ioperm_get
+ [REGSET32_IOPERM] = {
+ .core_note_type = NT_386_IOPERM,
+ .n = IO_BITMAP_BYTES / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .active = ioperm_active,
+ .regset_get = ioperm_get
},
};
@@ -1311,10 +1353,10 @@ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
- x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64);
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
- x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
+ x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64);
#endif
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index eda37df016f0..56acf53a782a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -64,7 +64,8 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
return flags & valid_flags;
}
-u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+static __always_inline
+u64 __pvclock_clocksource_read(struct pvclock_vcpu_time_info *src, bool dowd)
{
unsigned version;
u64 ret;
@@ -77,7 +78,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
flags = src->flags;
} while (pvclock_read_retry(src, version));
- if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ if (dowd && unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
}
@@ -100,16 +101,25 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
* updating at the same time, and one of them could be slightly behind,
* making the assumption that last_value always go forward fail to hold.
*/
- last = atomic64_read(&last_value);
+ last = arch_atomic64_read(&last_value);
do {
- if (ret < last)
+ if (ret <= last)
return last;
- last = atomic64_cmpxchg(&last_value, last, ret);
- } while (unlikely(last != ret));
+ } while (!arch_atomic64_try_cmpxchg(&last_value, &last, ret));
return ret;
}
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, true);
+}
+
+noinstr u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src)
+{
+ return __pvclock_clocksource_read(src, false);
+}
+
void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
struct pvclock_vcpu_time_info *vcpu_time,
struct timespec64 *ts)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..3adbe97015c1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,33 +528,29 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
-static void emergency_vmx_disable_all(void)
+static void emergency_reboot_disable_virtualization(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
- * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
- * the machine, because the CPU blocks INIT when it's in VMX root.
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
* We can't take any locks and we may be on an inconsistent state, so
- * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
- * doesn't prevent a different CPU from being in VMX root operation.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx()) {
- /* Safely force _this_ CPU out of VMX root operation. */
- __cpu_emergency_vmxoff();
+ if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
- /* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -590,7 +586,7 @@ static void native_machine_emergency_restart(void)
unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
@@ -795,6 +791,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +824,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +842,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +895,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -882,7 +920,7 @@ void run_crash_ipi_callback(struct pt_regs *regs)
}
/* Override the weak function in kernel/panic.c */
-void nmi_panic_self_stop(struct pt_regs *regs)
+void __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
while (1) {
/* If no CPU is preparing crash dump, we simply loop here. */
@@ -897,6 +935,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4809c0dc4eb0..56cab1bb25f5 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -41,8 +41,9 @@
.text
.align PAGE_SIZE
.code64
+SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* %rdi indirection_page
@@ -112,7 +113,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
@@ -230,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
@@ -255,8 +256,8 @@ SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
- UNWIND_HINT_EMPTY
- movq %rdi, %rcx /* Put the page_list in %rcx */
+ UNWIND_HINT_END_OF_STACK
+ movq %rdi, %rcx /* Put the page_list in %rcx */
xorl %edi, %edi
xorl %esi, %esi
jmp 1f
@@ -312,5 +313,5 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
int3
SYM_CODE_END(swap_pages)
- .globl kexec_control_code_size
-.set kexec_control_code_size, . - relocate_kernel
+ .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
+SYM_CODE_END(relocate_range);
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index bba1abd05bfe..79bc8a97a083 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -42,8 +42,16 @@ static void remove_e820_regions(struct resource *avail)
resource_clip(avail, e820_start, e820_end);
if (orig.start != avail->start || orig.end != avail->end) {
- pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n",
- &orig, avail, e820_start, e820_end);
+ pr_info("resource: avoiding allocation from e820 entry [mem %#010Lx-%#010Lx]\n",
+ e820_start, e820_end);
+ if (avail->end > avail->start)
+ /*
+ * Use %pa instead of %pR because "avail"
+ * is typically IORESOURCE_UNSET, so %pR
+ * shows the size instead of addresses.
+ */
+ pr_info("resource: remaining [mem %pa-%pa] available\n",
+ &avail->start, &avail->end);
orig = *avail;
}
}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 349046434513..1309b9b05338 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -138,15 +138,12 @@ static __init int add_rtc_cmos(void)
static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
- struct pnp_id *id;
int i;
pnp_for_each_dev(dev) {
- for (id = dev->id; id; id = id->next) {
- for (i = 0; i < ARRAY_SIZE(ids); i++) {
- if (compare_pnp_id(id, ids[i]) != 0)
- return 0;
- }
+ for (i = 0; i < ARRAY_SIZE(ids); i++) {
+ if (compare_pnp_id(dev->id, ids[i]) != 0)
+ return 0;
}
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 216fee7144ee..16babff771bd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,9 +31,11 @@
#include <xen/xen.h>
#include <asm/apic.h>
+#include <asm/efi.h>
#include <asm/numa.h>
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
+#include <asm/cacheinfo.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
@@ -112,11 +114,6 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
-
-/* Common CPU data for all CPUs */
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
-
unsigned int def_to_bigsmp;
struct apm_info apm_info;
@@ -130,11 +127,10 @@ EXPORT_SYMBOL(ist_info);
struct ist_info ist_info;
#endif
-#else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
-EXPORT_SYMBOL(boot_cpu_data);
#endif
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
__visible unsigned long mmu_cr4_features __ro_after_init;
@@ -1074,24 +1070,13 @@ void __init setup_arch(char **cmdline_p)
max_pfn = e820__end_of_ram_pfn();
/* update e820 for memory not covered by WB MTRRs */
- if (IS_ENABLED(CONFIG_MTRR))
- mtrr_bp_init();
- else
- pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
-
+ cache_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
max_pfn = e820__end_of_ram_pfn();
max_possible_pfn = max_pfn;
/*
- * This call is required when the CPU does not support PAT. If
- * mtrr_bp_init() invoked it already via pat_init() the call has no
- * effect.
- */
- init_cache_modes();
-
- /*
* Define random base addresses for memory sections after max_pfn is
* defined and before each memory section base is used.
*/
@@ -1175,7 +1160,7 @@ void __init setup_arch(char **cmdline_p)
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
*/
- reserve_real_mode();
+ x86_platform.realmode_reserve();
init_mem_mapping();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 49325caa7307..c242dc47e9cb 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
+#include <linux/stackprotector.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/desc.h>
@@ -21,10 +22,6 @@
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
-#include <asm/stackprotector.h>
-
-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
@@ -172,7 +169,7 @@ void __init setup_per_cpu_areas(void)
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
- per_cpu(cpu_number, cpu) = cpu;
+ per_cpu(pcpu_hot.cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
/*
* Copy data used in early init routines from the
@@ -211,7 +208,7 @@ void __init setup_per_cpu_areas(void)
* area. Reload any changed state for the boot CPU.
*/
if (!cpu)
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
}
/* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a428c62330d3..b031244d6d2d 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -22,6 +22,8 @@
#include <linux/efi.h>
#include <linux/platform_device.h>
#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
@@ -1536,32 +1538,32 @@ static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
{
struct insn *insn = &ctxt->insn;
+ enum insn_mmio_type mmio;
unsigned int bytes = 0;
- enum mmio_type mmio;
enum es_result ret;
u8 sign_byte;
long *reg_data;
mmio = insn_decode_mmio(insn, &bytes);
- if (mmio == MMIO_DECODE_FAILED)
+ if (mmio == INSN_MMIO_DECODE_FAILED)
return ES_DECODE_FAILED;
- if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
+ if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
if (!reg_data)
return ES_DECODE_FAILED;
}
switch (mmio) {
- case MMIO_WRITE:
+ case INSN_MMIO_WRITE:
memcpy(ghcb->shared_buffer, reg_data, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
- case MMIO_WRITE_IMM:
+ case INSN_MMIO_WRITE_IMM:
memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
ret = vc_do_mmio(ghcb, ctxt, bytes, false);
break;
- case MMIO_READ:
+ case INSN_MMIO_READ:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1572,7 +1574,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_READ_ZERO_EXTEND:
+ case INSN_MMIO_READ_ZERO_EXTEND:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1581,7 +1583,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memset(reg_data, 0, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_READ_SIGN_EXTEND:
+ case INSN_MMIO_READ_SIGN_EXTEND:
ret = vc_do_mmio(ghcb, ctxt, bytes, true);
if (ret)
break;
@@ -1600,7 +1602,7 @@ static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
memset(reg_data, sign_byte, insn->opnd_bytes);
memcpy(reg_data, ghcb->shared_buffer, bytes);
break;
- case MMIO_MOVS:
+ case INSN_MMIO_MOVS:
ret = vc_handle_mmio_movs(ctxt, bytes);
break;
default:
@@ -2175,7 +2177,7 @@ static int __init init_sev_config(char *str)
}
__setup("sev=", init_sev_config);
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
struct ghcb_state state;
struct es_em_ctxt ctxt;
@@ -2183,11 +2185,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
struct ghcb *ghcb;
int ret;
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return -ENODEV;
-
- if (!fw_err)
- return -EINVAL;
+ rio->exitinfo2 = SEV_RET_NO_FW_CALL;
/*
* __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -2212,15 +2210,26 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
if (ret)
goto e_put;
- if (ghcb->save.sw_exit_info_2) {
- /* Number of expected pages are returned in RBX */
- if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST &&
- ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN)
- input->data_npages = ghcb_get_rbx(ghcb);
+ rio->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (rio->exitinfo2) {
+ case 0:
+ break;
- *fw_err = ghcb->save.sw_exit_info_2;
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
+ ret = -EAGAIN;
+ break;
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
+ /* Number of expected pages are returned in RBX */
+ if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
+ input->data_npages = ghcb_get_rbx(ghcb);
+ ret = -ENOSPC;
+ break;
+ }
+ fallthrough;
+ default:
ret = -EIO;
+ break;
}
e_put:
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9c7265b524c7..004cb30b7419 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -37,180 +37,27 @@
#include <asm/sighandling.h>
#include <asm/vm86.h>
-#ifdef CONFIG_X86_64
-#include <linux/compat.h>
-#include <asm/proto.h>
-#include <asm/ia32_unistd.h>
-#include <asm/fpu/xstate.h>
-#endif /* CONFIG_X86_64 */
-
#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
-#ifdef CONFIG_X86_64
-/*
- * If regs->ss will cause an IRET fault, change it. Otherwise leave it
- * alone. Using this generally makes no sense unless
- * user_64bit_mode(regs) would return true.
- */
-static void force_valid_ss(struct pt_regs *regs)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
- u32 ar;
- asm volatile ("lar %[old_ss], %[ar]\n\t"
- "jz 1f\n\t" /* If invalid: */
- "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
- "1:"
- : [ar] "=r" (ar)
- : [old_ss] "rm" ((u16)regs->ss));
-
- /*
- * For a valid 64-bit user context, we need DPL 3, type
- * read-write data or read-write exp-down data, and S and P
- * set. We can't use VERW because VERW doesn't check the
- * P bit.
- */
- ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
- if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
- ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
- regs->ss = __USER_DS;
+ return IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
}
-# define CONTEXT_COPY_SIZE offsetof(struct sigcontext, reserved1)
-#else
-# define CONTEXT_COPY_SIZE sizeof(struct sigcontext)
-#endif
-static bool restore_sigcontext(struct pt_regs *regs,
- struct sigcontext __user *usc,
- unsigned long uc_flags)
+static inline int is_ia32_frame(struct ksignal *ksig)
{
- struct sigcontext sc;
-
- /* Always make any pending restarted system calls return -EINTR */
- current->restart_block.fn = do_no_restart_syscall;
-
- if (copy_from_user(&sc, usc, CONTEXT_COPY_SIZE))
- return false;
-
-#ifdef CONFIG_X86_32
- loadsegment(gs, sc.gs);
- regs->fs = sc.fs;
- regs->es = sc.es;
- regs->ds = sc.ds;
-#endif /* CONFIG_X86_32 */
-
- regs->bx = sc.bx;
- regs->cx = sc.cx;
- regs->dx = sc.dx;
- regs->si = sc.si;
- regs->di = sc.di;
- regs->bp = sc.bp;
- regs->ax = sc.ax;
- regs->sp = sc.sp;
- regs->ip = sc.ip;
-
-#ifdef CONFIG_X86_64
- regs->r8 = sc.r8;
- regs->r9 = sc.r9;
- regs->r10 = sc.r10;
- regs->r11 = sc.r11;
- regs->r12 = sc.r12;
- regs->r13 = sc.r13;
- regs->r14 = sc.r14;
- regs->r15 = sc.r15;
-#endif /* CONFIG_X86_64 */
-
- /* Get CS/SS and force CPL3 */
- regs->cs = sc.cs | 0x03;
- regs->ss = sc.ss | 0x03;
-
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
- /* disable syscall checks */
- regs->orig_ax = -1;
-
-#ifdef CONFIG_X86_64
- /*
- * Fix up SS if needed for the benefit of old DOSEMU and
- * CRIU.
- */
- if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
- force_valid_ss(regs);
-#endif
-
- return fpu__restore_sig((void __user *)sc.fpstate,
- IS_ENABLED(CONFIG_X86_32));
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
}
-static __always_inline int
-__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
- struct pt_regs *regs, unsigned long mask)
+static inline int is_x32_frame(struct ksignal *ksig)
{
-#ifdef CONFIG_X86_32
- unsigned int gs;
- savesegment(gs, gs);
-
- unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault);
- unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
- unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
- unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
-#endif /* CONFIG_X86_32 */
-
- unsafe_put_user(regs->di, &sc->di, Efault);
- unsafe_put_user(regs->si, &sc->si, Efault);
- unsafe_put_user(regs->bp, &sc->bp, Efault);
- unsafe_put_user(regs->sp, &sc->sp, Efault);
- unsafe_put_user(regs->bx, &sc->bx, Efault);
- unsafe_put_user(regs->dx, &sc->dx, Efault);
- unsafe_put_user(regs->cx, &sc->cx, Efault);
- unsafe_put_user(regs->ax, &sc->ax, Efault);
-#ifdef CONFIG_X86_64
- unsafe_put_user(regs->r8, &sc->r8, Efault);
- unsafe_put_user(regs->r9, &sc->r9, Efault);
- unsafe_put_user(regs->r10, &sc->r10, Efault);
- unsafe_put_user(regs->r11, &sc->r11, Efault);
- unsafe_put_user(regs->r12, &sc->r12, Efault);
- unsafe_put_user(regs->r13, &sc->r13, Efault);
- unsafe_put_user(regs->r14, &sc->r14, Efault);
- unsafe_put_user(regs->r15, &sc->r15, Efault);
-#endif /* CONFIG_X86_64 */
-
- unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
- unsafe_put_user(current->thread.error_code, &sc->err, Efault);
- unsafe_put_user(regs->ip, &sc->ip, Efault);
-#ifdef CONFIG_X86_32
- unsafe_put_user(regs->cs, (unsigned int __user *)&sc->cs, Efault);
- unsafe_put_user(regs->flags, &sc->flags, Efault);
- unsafe_put_user(regs->sp, &sc->sp_at_signal, Efault);
- unsafe_put_user(regs->ss, (unsigned int __user *)&sc->ss, Efault);
-#else /* !CONFIG_X86_32 */
- unsafe_put_user(regs->flags, &sc->flags, Efault);
- unsafe_put_user(regs->cs, &sc->cs, Efault);
- unsafe_put_user(0, &sc->gs, Efault);
- unsafe_put_user(0, &sc->fs, Efault);
- unsafe_put_user(regs->ss, &sc->ss, Efault);
-#endif /* CONFIG_X86_32 */
-
- unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
-
- /* non-iBCS2 extensions.. */
- unsafe_put_user(mask, &sc->oldmask, Efault);
- unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
- return 0;
-Efault:
- return -EFAULT;
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
-#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
-do { \
- if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
- goto label; \
-} while(0);
-
-#define unsafe_put_sigmask(set, frame, label) \
- unsafe_put_user(*(__u64 *)(set), \
- (__u64 __user *)&(frame)->uc.uc_sigmask, \
- label)
-
/*
* Set up a signal frame.
*/
@@ -223,24 +70,12 @@ do { \
/*
* Determine which stack to use..
*/
-static unsigned long align_sigframe(unsigned long sp)
-{
-#ifdef CONFIG_X86_32
- /*
- * Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0.
- */
- sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
-#else /* !CONFIG_X86_32 */
- sp = round_down(sp, FRAME_ALIGNMENT) - 8;
-#endif
- return sp;
-}
-
-static void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+void __user *
+get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
void __user **fpstate)
{
+ struct k_sigaction *ka = &ksig->ka;
+ int ia32_frame = is_ia32_frame(ksig);
/* Default to using normal stack */
bool nested_altstack = on_sig_stack(regs->sp);
bool entering_altstack = false;
@@ -249,7 +84,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
unsigned long buf_fx = 0;
/* redzone */
- if (IS_ENABLED(CONFIG_X86_64))
+ if (!ia32_frame)
sp -= 128;
/* This is the X/Open sanctioned signal stack switching. */
@@ -263,7 +98,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = current->sas_ss_sp + current->sas_ss_size;
entering_altstack = true;
}
- } else if (IS_ENABLED(CONFIG_X86_32) &&
+ } else if (ia32_frame &&
!nested_altstack &&
regs->ss != __USER_DS &&
!(ka->sa.sa_flags & SA_RESTORER) &&
@@ -273,11 +108,19 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
entering_altstack = true;
}
- sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
- &buf_fx, &math_size);
+ sp = fpu__alloc_mathframe(sp, ia32_frame, &buf_fx, &math_size);
*fpstate = (void __user *)sp;
- sp = align_sigframe(sp - frame_size);
+ sp -= frame_size;
+
+ if (ia32_frame)
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
+ sp = ((sp + 4) & -FRAME_ALIGNMENT) - 4;
+ else
+ sp = round_down(sp, FRAME_ALIGNMENT) - 8;
/*
* If we are on the alternate signal stack and would overflow it, don't.
@@ -300,391 +143,6 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
return (void __user *)sp;
}
-#ifdef CONFIG_X86_32
-static const struct {
- u16 poplmovl;
- u32 val;
- u16 int80;
-} __attribute__((packed)) retcode = {
- 0xb858, /* popl %eax; movl $..., %eax */
- __NR_sigreturn,
- 0x80cd, /* int $0x80 */
-};
-
-static const struct {
- u8 movl;
- u32 val;
- u16 int80;
- u8 pad;
-} __attribute__((packed)) rt_retcode = {
- 0xb8, /* movl $..., %eax */
- __NR_rt_sigreturn,
- 0x80cd, /* int $0x80 */
- 0
-};
-
-static int
-__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
- struct pt_regs *regs)
-{
- struct sigframe __user *frame;
- void __user *restorer;
- void __user *fp = NULL;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- unsafe_put_user(sig, &frame->sig, Efault);
- unsafe_put_sigcontext(&frame->sc, fp, regs, set, Efault);
- unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
- if (current->mm->context.vdso)
- restorer = current->mm->context.vdso +
- vdso_image_32.sym___kernel_sigreturn;
- else
- restorer = &frame->retcode;
- if (ksig->ka.sa.sa_flags & SA_RESTORER)
- restorer = ksig->ka.sa.sa_restorer;
-
- /* Set up to return from userspace. */
- unsafe_put_user(restorer, &frame->pretcode, Efault);
-
- /*
- * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- unsafe_put_user(*((u64 *)&retcode), (u64 *)frame->retcode, Efault);
- user_access_end();
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = 0;
- regs->cx = 0;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-
-Efault:
- user_access_end();
- return -EFAULT;
-}
-
-static int __setup_rt_frame(int sig, struct ksignal *ksig,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *restorer;
- void __user *fp = NULL;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- unsafe_put_user(sig, &frame->sig, Efault);
- unsafe_put_user(&frame->info, &frame->pinfo, Efault);
- unsafe_put_user(&frame->uc, &frame->puc, Efault);
-
- /* Create the ucontext. */
- if (static_cpu_has(X86_FEATURE_XSAVE))
- unsafe_put_user(UC_FP_XSTATE, &frame->uc.uc_flags, Efault);
- else
- unsafe_put_user(0, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
-
- /* Set up to return from userspace. */
- restorer = current->mm->context.vdso +
- vdso_image_32.sym___kernel_rt_sigreturn;
- if (ksig->ka.sa.sa_flags & SA_RESTORER)
- restorer = ksig->ka.sa.sa_restorer;
- unsafe_put_user(restorer, &frame->pretcode, Efault);
-
- /*
- * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
- *
- * WE DO NOT USE IT ANY MORE! It's only left here for historical
- * reasons and because gdb uses it as a signature to notice
- * signal handler stack frames.
- */
- unsafe_put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long)frame;
- regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
- regs->ax = (unsigned long)sig;
- regs->dx = (unsigned long)&frame->info;
- regs->cx = (unsigned long)&frame->uc;
-
- regs->ds = __USER_DS;
- regs->es = __USER_DS;
- regs->ss = __USER_DS;
- regs->cs = __USER_CS;
-
- return 0;
-Efault:
- user_access_end();
- return -EFAULT;
-}
-#else /* !CONFIG_X86_32 */
-static unsigned long frame_uc_flags(struct pt_regs *regs)
-{
- unsigned long flags;
-
- if (boot_cpu_has(X86_FEATURE_XSAVE))
- flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
- else
- flags = UC_SIGCONTEXT_SS;
-
- if (likely(user_64bit_mode(regs)))
- flags |= UC_STRICT_RESTORE_SS;
-
- return flags;
-}
-
-static int __setup_rt_frame(int sig, struct ksignal *ksig,
- sigset_t *set, struct pt_regs *regs)
-{
- struct rt_sigframe __user *frame;
- void __user *fp = NULL;
- unsigned long uc_flags;
-
- /* x86-64 should always use SA_RESTORER. */
- if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
- return -EFAULT;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
- uc_flags = frame_uc_flags(regs);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- /* Create the ucontext. */
- unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
-
- /* Set up to return from userspace. If provided, use a stub
- already in userspace. */
- unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
- }
-
- /* Set up registers for signal handler */
- regs->di = sig;
- /* In case the signal handler was declared without prototypes */
- regs->ax = 0;
-
- /* This also works for non SA_SIGINFO handlers because they expect the
- next argument after the signal number on the stack. */
- regs->si = (unsigned long)&frame->info;
- regs->dx = (unsigned long)&frame->uc;
- regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
-
- regs->sp = (unsigned long)frame;
-
- /*
- * Set up the CS and SS registers to run signal handlers in
- * 64-bit mode, even if the handler happens to be interrupting
- * 32-bit or 16-bit code.
- *
- * SS is subtle. In 64-bit mode, we don't need any particular
- * SS descriptor, but we do need SS to be valid. It's possible
- * that the old SS is entirely bogus -- this can happen if the
- * signal we're trying to deliver is #GP or #SS caused by a bad
- * SS value. We also have a compatibility issue here: DOSEMU
- * relies on the contents of the SS register indicating the
- * SS value at the time of the signal, even though that code in
- * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
- * avoids relying on sigreturn to restore SS; instead it uses
- * a trampoline.) So we do our best: if the old SS was valid,
- * we keep it. Otherwise we replace it.
- */
- regs->cs = __USER_CS;
-
- if (unlikely(regs->ss != __USER_DS))
- force_valid_ss(regs);
-
- return 0;
-
-Efault:
- user_access_end();
- return -EFAULT;
-}
-#endif /* CONFIG_X86_32 */
-
-#ifdef CONFIG_X86_X32_ABI
-static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from)
-{
- struct compat_siginfo new;
-
- copy_siginfo_to_external32(&new, from);
- if (from->si_signo == SIGCHLD) {
- new._sifields._sigchld_x32._utime = from->si_utime;
- new._sifields._sigchld_x32._stime = from->si_stime;
- }
- if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
- return -EFAULT;
- return 0;
-}
-
-int copy_siginfo_to_user32(struct compat_siginfo __user *to,
- const struct kernel_siginfo *from)
-{
- if (in_x32_syscall())
- return x32_copy_siginfo_to_user(to, from);
- return __copy_siginfo_to_user32(to, from);
-}
-#endif /* CONFIG_X86_X32_ABI */
-
-static int x32_setup_rt_frame(struct ksignal *ksig,
- compat_sigset_t *set,
- struct pt_regs *regs)
-{
-#ifdef CONFIG_X86_X32_ABI
- struct rt_sigframe_x32 __user *frame;
- unsigned long uc_flags;
- void __user *restorer;
- void __user *fp = NULL;
-
- if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
- return -EFAULT;
-
- frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fp);
-
- uc_flags = frame_uc_flags(regs);
-
- if (!user_access_begin(frame, sizeof(*frame)))
- return -EFAULT;
-
- /* Create the ucontext. */
- unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
- unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
- unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
- restorer = ksig->ka.sa.sa_restorer;
- unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
- unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
- unsafe_put_sigmask(set, frame, Efault);
- user_access_end();
-
- if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
- return -EFAULT;
- }
-
- /* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
-
- /* We use the x32 calling convention here... */
- regs->di = ksig->sig;
- regs->si = (unsigned long) &frame->info;
- regs->dx = (unsigned long) &frame->uc;
-
- loadsegment(ds, __USER_DS);
- loadsegment(es, __USER_DS);
-
- regs->cs = __USER_CS;
- regs->ss = __USER_DS;
-#endif /* CONFIG_X86_X32_ABI */
-
- return 0;
-#ifdef CONFIG_X86_X32_ABI
-Efault:
- user_access_end();
- return -EFAULT;
-#endif
-}
-
-/*
- * Do a signal return; undo the signal stack.
- */
-#ifdef CONFIG_X86_32
-SYSCALL_DEFINE0(sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct sigframe __user *frame;
- sigset_t set;
-
- frame = (struct sigframe __user *)(regs->sp - 8);
-
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask) ||
- __get_user(set.sig[1], &frame->extramask[0]))
- goto badframe;
-
- set_current_blocked(&set);
-
- /*
- * x86_32 has no uc_flags bits relevant to restore_sigcontext.
- * Save a few cycles by skipping the __get_user.
- */
- if (!restore_sigcontext(regs, &frame->sc, 0))
- goto badframe;
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "sigreturn");
-
- return 0;
-}
-#endif /* CONFIG_X86_32 */
-
-SYSCALL_DEFINE0(rt_sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct rt_sigframe __user *frame;
- sigset_t set;
- unsigned long uc_flags;
-
- frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
- goto badframe;
- if (__get_user(uc_flags, &frame->uc.uc_flags))
- goto badframe;
-
- set_current_blocked(&set);
-
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
- goto badframe;
-
- if (restore_altstack(&frame->uc.uc_stack))
- goto badframe;
-
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "rt_sigreturn");
- return 0;
-}
-
/*
* There are four different struct types for signal frame: sigframe_ia32,
* rt_sigframe_ia32, rt_sigframe_x32, and rt_sigframe. Use the worst case
@@ -743,43 +201,22 @@ unsigned long get_sigframe_size(void)
return max_frame_size;
}
-static inline int is_ia32_compat_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_IA32_EMULATION) &&
- ksig->ka.sa.sa_flags & SA_IA32_ABI;
-}
-
-static inline int is_ia32_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
-}
-
-static inline int is_x32_frame(struct ksignal *ksig)
-{
- return IS_ENABLED(CONFIG_X86_X32_ABI) &&
- ksig->ka.sa.sa_flags & SA_X32_ABI;
-}
-
static int
setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = ksig->sig;
- sigset_t *set = sigmask_to_save();
- compat_sigset_t *cset = (compat_sigset_t *) set;
-
/* Perform fixup for the pre-signal frame. */
rseq_signal_deliver(ksig, regs);
/* Set up the stack frame */
if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
- return ia32_setup_rt_frame(usig, ksig, cset, regs);
+ return ia32_setup_rt_frame(ksig, regs);
else
- return ia32_setup_frame(usig, ksig, cset, regs);
+ return ia32_setup_frame(ksig, regs);
} else if (is_x32_frame(ksig)) {
- return x32_setup_rt_frame(ksig, cset, regs);
+ return x32_setup_rt_frame(ksig, regs);
} else {
- return __setup_rt_frame(ksig->sig, ksig, set, regs);
+ return x64_setup_rt_frame(ksig, regs);
}
}
@@ -923,7 +360,7 @@ static bool strict_sigaltstack_size __ro_after_init = false;
static int __init strict_sas_size(char *arg)
{
- return kstrtobool(arg, &strict_sigaltstack_size);
+ return kstrtobool(arg, &strict_sigaltstack_size) == 0;
}
__setup("strict_sas_size", strict_sas_size);
@@ -969,36 +406,3 @@ bool sigaltstack_size_valid(size_t ss_size)
return true;
}
#endif /* CONFIG_DYNAMIC_SIGFRAME */
-
-#ifdef CONFIG_X86_X32_ABI
-COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
-{
- struct pt_regs *regs = current_pt_regs();
- struct rt_sigframe_x32 __user *frame;
- sigset_t set;
- unsigned long uc_flags;
-
- frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
-
- if (!access_ok(frame, sizeof(*frame)))
- goto badframe;
- if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
- goto badframe;
- if (__get_user(uc_flags, &frame->uc.uc_flags))
- goto badframe;
-
- set_current_blocked(&set);
-
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
- goto badframe;
-
- if (compat_restore_altstack(&frame->uc.uc_stack))
- goto badframe;
-
- return regs->ax;
-
-badframe:
- signal_fault(regs, frame, "x32 rt_sigreturn");
- return 0;
-}
-#endif
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/kernel/signal_32.c
index c9c3859322fa..9027fc088f97 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/kernel/signal_32.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/arch/x86_64/ia32/ia32_signal.c
- *
* Copyright (C) 1991, 1992 Linus Torvalds
*
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
@@ -26,7 +24,6 @@
#include <linux/uaccess.h>
#include <asm/fpu/signal.h>
#include <asm/ptrace.h>
-#include <asm/ia32_unistd.h>
#include <asm/user32.h>
#include <uapi/asm/sigcontext.h>
#include <asm/proto.h>
@@ -34,6 +31,10 @@
#include <asm/sigframe.h>
#include <asm/sighandling.h>
#include <asm/smap.h>
+#include <asm/gsseg.h>
+
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/ia32_unistd.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
@@ -53,6 +54,23 @@ static inline void reload_segments(struct sigcontext_32 *sc)
loadsegment(es, sc->es | 0x03);
}
+#define sigset32_t compat_sigset_t
+#define siginfo32_t compat_siginfo_t
+#define restore_altstack32 compat_restore_altstack
+#define unsafe_save_altstack32 unsafe_compat_save_altstack
+
+#else
+
+#define sigset32_t sigset_t
+#define siginfo32_t siginfo_t
+#define __NR_ia32_sigreturn __NR_sigreturn
+#define __NR_ia32_rt_sigreturn __NR_rt_sigreturn
+#define restore_altstack32 restore_altstack
+#define unsafe_save_altstack32 unsafe_save_altstack
+#define __copy_siginfo_to_user32 copy_siginfo_to_user
+
+#endif
+
/*
* Do a signal return; undo the signal stack.
*/
@@ -86,6 +104,7 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
/* disable syscall checks */
regs->orig_ax = -1;
+#ifdef CONFIG_IA32_EMULATION
/*
* Reload fs and gs if they have changed in the signal
* handler. This does not handle long fs/gs base changes in
@@ -93,10 +112,17 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs,
* normal case.
*/
reload_segments(&sc);
+#else
+ loadsegment(gs, sc.gs);
+ regs->fs = sc.fs;
+ regs->es = sc.es;
+ regs->ds = sc.ds;
+#endif
+
return fpu__restore_sig(compat_ptr(sc.fpstate), 1);
}
-COMPAT_SYSCALL_DEFINE0(sigreturn)
+SYSCALL32_DEFINE0(sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
@@ -119,7 +145,7 @@ badframe:
return 0;
}
-COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
+SYSCALL32_DEFINE0(rt_sigreturn)
{
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_ia32 __user *frame;
@@ -129,7 +155,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
if (!access_ok(frame, sizeof(*frame)))
goto badframe;
- if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
goto badframe;
set_current_blocked(&set);
@@ -137,7 +163,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
if (!ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
- if (compat_restore_altstack(&frame->uc.uc_stack))
+ if (restore_altstack32(&frame->uc.uc_stack))
goto badframe;
return regs->ax;
@@ -159,9 +185,15 @@ __unsafe_setup_sigcontext32(struct sigcontext_32 __user *sc,
struct pt_regs *regs, unsigned int mask)
{
unsafe_put_user(get_user_seg(gs), (unsigned int __user *)&sc->gs, Efault);
+#ifdef CONFIG_IA32_EMULATION
unsafe_put_user(get_user_seg(fs), (unsigned int __user *)&sc->fs, Efault);
unsafe_put_user(get_user_seg(ds), (unsigned int __user *)&sc->ds, Efault);
unsafe_put_user(get_user_seg(es), (unsigned int __user *)&sc->es, Efault);
+#else
+ unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault);
+ unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault);
+ unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault);
+#endif
unsafe_put_user(regs->di, &sc->di, Efault);
unsafe_put_user(regs->si, &sc->si, Efault);
@@ -196,43 +228,9 @@ do { \
goto label; \
} while(0)
-/*
- * Determine which stack to use..
- */
-static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
- size_t frame_size,
- void __user **fpstate)
-{
- unsigned long sp, fx_aligned, math_size;
-
- /* Default to using normal stack */
- sp = regs->sp;
-
- /* This is the X/Open sanctioned signal stack switching. */
- if (ksig->ka.sa.sa_flags & SA_ONSTACK)
- sp = sigsp(sp, ksig);
- /* This is the legacy signal stack switching. */
- else if (regs->ss != __USER32_DS &&
- !(ksig->ka.sa.sa_flags & SA_RESTORER) &&
- ksig->ka.sa.sa_restorer)
- sp = (unsigned long) ksig->ka.sa.sa_restorer;
-
- sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
- *fpstate = (struct _fpstate_32 __user *) sp;
- if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
- math_size))
- return (void __user *) -1L;
-
- sp -= frame_size;
- /* Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0. */
- sp = ((sp + 4) & -16ul) - 4;
- return (void __user *) sp;
-}
-
-int ia32_setup_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs)
+int ia32_setup_frame(struct ksignal *ksig, struct pt_regs *regs)
{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
struct sigframe_ia32 __user *frame;
void __user *restorer;
void __user *fp = NULL;
@@ -264,7 +262,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
- unsafe_put_user(sig, &frame->sig, Efault);
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
unsafe_put_sigcontext32(&frame->sc, fp, regs, set, Efault);
unsafe_put_user(set->sig[1], &frame->extramask[0], Efault);
unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
@@ -280,15 +278,20 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
- regs->ax = sig;
+ regs->ax = ksig->sig;
regs->dx = 0;
regs->cx = 0;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
+ regs->ss = __USER_DS;
return 0;
Efault:
@@ -296,9 +299,9 @@ Efault:
return -EFAULT;
}
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
- compat_sigset_t *set, struct pt_regs *regs)
+int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
+ sigset32_t *set = (sigset32_t *) sigmask_to_save();
struct rt_sigframe_ia32 __user *frame;
void __user *restorer;
void __user *fp = NULL;
@@ -321,7 +324,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
- unsafe_put_user(sig, &frame->sig, Efault);
+ unsafe_put_user(ksig->sig, &frame->sig, Efault);
unsafe_put_user(ptr_to_compat(&frame->info), &frame->pinfo, Efault);
unsafe_put_user(ptr_to_compat(&frame->uc), &frame->puc, Efault);
@@ -331,7 +334,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
else
unsafe_put_user(0, &frame->uc.uc_flags, Efault);
unsafe_put_user(0, &frame->uc.uc_link, Efault);
- unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_save_altstack32(&frame->uc.uc_stack, regs->sp, Efault);
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
@@ -357,18 +360,148 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
/* Make -mregparm=3 work */
- regs->ax = sig;
+ regs->ax = ksig->sig;
regs->dx = (unsigned long) &frame->info;
regs->cx = (unsigned long) &frame->uc;
- loadsegment(ds, __USER32_DS);
- loadsegment(es, __USER32_DS);
+#ifdef CONFIG_IA32_EMULATION
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#else
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+#endif
regs->cs = __USER32_CS;
- regs->ss = __USER32_DS;
+ regs->ss = __USER_DS;
return 0;
Efault:
user_access_end();
return -EFAULT;
}
+
+/*
+ * The siginfo_t structure and handing code is very easy
+ * to break in several ways. It must always be updated when new
+ * updates are made to the main siginfo_t, and
+ * copy_siginfo_to_user32() must be updated when the
+ * (arch-independent) copy_siginfo_to_user() is updated.
+ *
+ * It is also easy to put a new member in the siginfo_t
+ * which has implicit alignment which can move internal structure
+ * alignment around breaking the ABI. This can happen if you,
+ * for instance, put a plain 64-bit value in there.
+ */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo32_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo32_t) == 4);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo32_t, _sifields) == 3 * sizeof(int));
+
+static_assert(offsetof(siginfo32_t, si_signo) == 0);
+static_assert(offsetof(siginfo32_t, si_errno) == 4);
+static_assert(offsetof(siginfo32_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo32_t, _sifields) == \
+ offsetof(siginfo32_t, _sifields.name))
+
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo32_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0xC);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+
+CHECK_SI_OFFSET(_timer);
+#ifdef CONFIG_COMPAT
+/* compat_siginfo_t doesn't have si_sys_private */
+CHECK_SI_SIZE (_timer, 3*sizeof(int));
+#else
+CHECK_SI_SIZE (_timer, 4*sizeof(int));
+#endif
+static_assert(offsetof(siginfo32_t, si_tid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_overrun) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_value) == 0x14);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 5*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_pid) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_uid) == 0x10);
+static_assert(offsetof(siginfo32_t, si_status) == 0x14);
+static_assert(offsetof(siginfo32_t, si_utime) == 0x18);
+static_assert(offsetof(siginfo32_t, si_stime) == 0x1C);
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 4*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_addr) == 0x0C);
+
+static_assert(offsetof(siginfo32_t, si_trapno) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_addr_lsb) == 0x10);
+
+static_assert(offsetof(siginfo32_t, si_lower) == 0x14);
+static_assert(offsetof(siginfo32_t, si_upper) == 0x18);
+
+static_assert(offsetof(siginfo32_t, si_pkey) == 0x14);
+
+static_assert(offsetof(siginfo32_t, si_perf_data) == 0x10);
+static_assert(offsetof(siginfo32_t, si_perf_type) == 0x14);
+static_assert(offsetof(siginfo32_t, si_perf_flags) == 0x18);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 2*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_band) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_fd) == 0x10);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 3*sizeof(int));
+static_assert(offsetof(siginfo32_t, si_call_addr) == 0x0C);
+static_assert(offsetof(siginfo32_t, si_syscall) == 0x10);
+static_assert(offsetof(siginfo32_t, si_arch) == 0x14);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
new file mode 100644
index 000000000000..13a1e6083837
--- /dev/null
+++ b/arch/x86/kernel/signal_64.c
@@ -0,0 +1,510 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+#include <asm/ucontext.h>
+#include <asm/fpu/signal.h>
+#include <asm/sighandling.h>
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+
+/*
+ * If regs->ss will cause an IRET fault, change it. Otherwise leave it
+ * alone. Using this generally makes no sense unless
+ * user_64bit_mode(regs) would return true.
+ */
+static void force_valid_ss(struct pt_regs *regs)
+{
+ u32 ar;
+ asm volatile ("lar %[old_ss], %[ar]\n\t"
+ "jz 1f\n\t" /* If invalid: */
+ "xorl %[ar], %[ar]\n\t" /* set ar = 0 */
+ "1:"
+ : [ar] "=r" (ar)
+ : [old_ss] "rm" ((u16)regs->ss));
+
+ /*
+ * For a valid 64-bit user context, we need DPL 3, type
+ * read-write data or read-write exp-down data, and S and P
+ * set. We can't use VERW because VERW doesn't check the
+ * P bit.
+ */
+ ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK;
+ if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) &&
+ ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN))
+ regs->ss = __USER_DS;
+}
+
+static bool restore_sigcontext(struct pt_regs *regs,
+ struct sigcontext __user *usc,
+ unsigned long uc_flags)
+{
+ struct sigcontext sc;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current->restart_block.fn = do_no_restart_syscall;
+
+ if (copy_from_user(&sc, usc, offsetof(struct sigcontext, reserved1)))
+ return false;
+
+ regs->bx = sc.bx;
+ regs->cx = sc.cx;
+ regs->dx = sc.dx;
+ regs->si = sc.si;
+ regs->di = sc.di;
+ regs->bp = sc.bp;
+ regs->ax = sc.ax;
+ regs->sp = sc.sp;
+ regs->ip = sc.ip;
+ regs->r8 = sc.r8;
+ regs->r9 = sc.r9;
+ regs->r10 = sc.r10;
+ regs->r11 = sc.r11;
+ regs->r12 = sc.r12;
+ regs->r13 = sc.r13;
+ regs->r14 = sc.r14;
+ regs->r15 = sc.r15;
+
+ /* Get CS/SS and force CPL3 */
+ regs->cs = sc.cs | 0x03;
+ regs->ss = sc.ss | 0x03;
+
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (sc.flags & FIX_EFLAGS);
+ /* disable syscall checks */
+ regs->orig_ax = -1;
+
+ /*
+ * Fix up SS if needed for the benefit of old DOSEMU and
+ * CRIU.
+ */
+ if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs)))
+ force_valid_ss(regs);
+
+ return fpu__restore_sig((void __user *)sc.fpstate, 0);
+}
+
+static __always_inline int
+__unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
+ struct pt_regs *regs, unsigned long mask)
+{
+ unsafe_put_user(regs->di, &sc->di, Efault);
+ unsafe_put_user(regs->si, &sc->si, Efault);
+ unsafe_put_user(regs->bp, &sc->bp, Efault);
+ unsafe_put_user(regs->sp, &sc->sp, Efault);
+ unsafe_put_user(regs->bx, &sc->bx, Efault);
+ unsafe_put_user(regs->dx, &sc->dx, Efault);
+ unsafe_put_user(regs->cx, &sc->cx, Efault);
+ unsafe_put_user(regs->ax, &sc->ax, Efault);
+ unsafe_put_user(regs->r8, &sc->r8, Efault);
+ unsafe_put_user(regs->r9, &sc->r9, Efault);
+ unsafe_put_user(regs->r10, &sc->r10, Efault);
+ unsafe_put_user(regs->r11, &sc->r11, Efault);
+ unsafe_put_user(regs->r12, &sc->r12, Efault);
+ unsafe_put_user(regs->r13, &sc->r13, Efault);
+ unsafe_put_user(regs->r14, &sc->r14, Efault);
+ unsafe_put_user(regs->r15, &sc->r15, Efault);
+
+ unsafe_put_user(current->thread.trap_nr, &sc->trapno, Efault);
+ unsafe_put_user(current->thread.error_code, &sc->err, Efault);
+ unsafe_put_user(regs->ip, &sc->ip, Efault);
+ unsafe_put_user(regs->flags, &sc->flags, Efault);
+ unsafe_put_user(regs->cs, &sc->cs, Efault);
+ unsafe_put_user(0, &sc->gs, Efault);
+ unsafe_put_user(0, &sc->fs, Efault);
+ unsafe_put_user(regs->ss, &sc->ss, Efault);
+
+ unsafe_put_user(fpstate, (unsigned long __user *)&sc->fpstate, Efault);
+
+ /* non-iBCS2 extensions.. */
+ unsafe_put_user(mask, &sc->oldmask, Efault);
+ unsafe_put_user(current->thread.cr2, &sc->cr2, Efault);
+ return 0;
+Efault:
+ return -EFAULT;
+}
+
+#define unsafe_put_sigcontext(sc, fp, regs, set, label) \
+do { \
+ if (__unsafe_setup_sigcontext(sc, fp, regs, set->sig[0])) \
+ goto label; \
+} while(0);
+
+#define unsafe_put_sigmask(set, frame, label) \
+ unsafe_put_user(*(__u64 *)(set), \
+ (__u64 __user *)&(frame)->uc.uc_sigmask, \
+ label)
+
+static unsigned long frame_uc_flags(struct pt_regs *regs)
+{
+ unsigned long flags;
+
+ if (boot_cpu_has(X86_FEATURE_XSAVE))
+ flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS;
+ else
+ flags = UC_SIGCONTEXT_SS;
+
+ if (likely(user_64bit_mode(regs)))
+ flags |= UC_STRICT_RESTORE_SS;
+
+ return flags;
+}
+
+int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ sigset_t *set = sigmask_to_save();
+ struct rt_sigframe __user *frame;
+ void __user *fp = NULL;
+ unsigned long uc_flags;
+
+ /* x86-64 should always use SA_RESTORER. */
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+
+ /* Set up to return from userspace. If provided, use a stub
+ already in userspace. */
+ unsafe_put_user(ksig->ka.sa.sa_restorer, &frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->di = ksig->sig;
+ /* In case the signal handler was declared without prototypes */
+ regs->ax = 0;
+
+ /* This also works for non SA_SIGINFO handlers because they expect the
+ next argument after the signal number on the stack. */
+ regs->si = (unsigned long)&frame->info;
+ regs->dx = (unsigned long)&frame->uc;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ regs->sp = (unsigned long)frame;
+
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value. We also have a compatibility issue here: DOSEMU
+ * relies on the contents of the SS register indicating the
+ * SS value at the time of the signal, even though that code in
+ * DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
+ * avoids relying on sigreturn to restore SS; instead it uses
+ * a trampoline.) So we do our best: if the old SS was valid,
+ * we keep it. Otherwise we replace it.
+ */
+ regs->cs = __USER_CS;
+
+ if (unlikely(regs->ss != __USER_DS))
+ force_valid_ss(regs);
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+SYSCALL_DEFINE0(rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(*(__u64 *)&set, (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "rt_sigreturn");
+ return 0;
+}
+
+#ifdef CONFIG_X86_X32_ABI
+static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ struct compat_siginfo new;
+
+ copy_siginfo_to_external32(&new, from);
+ if (from->si_signo == SIGCHLD) {
+ new._sifields._sigchld_x32._utime = from->si_utime;
+ new._sifields._sigchld_x32._stime = from->si_stime;
+ }
+ if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+ return -EFAULT;
+ return 0;
+}
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+ const struct kernel_siginfo *from)
+{
+ if (in_x32_syscall())
+ return x32_copy_siginfo_to_user(to, from);
+ return __copy_siginfo_to_user32(to, from);
+}
+
+int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
+{
+ compat_sigset_t *set = (compat_sigset_t *) sigmask_to_save();
+ struct rt_sigframe_x32 __user *frame;
+ unsigned long uc_flags;
+ void __user *restorer;
+ void __user *fp = NULL;
+
+ if (!(ksig->ka.sa.sa_flags & SA_RESTORER))
+ return -EFAULT;
+
+ frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+
+ uc_flags = frame_uc_flags(regs);
+
+ if (!user_access_begin(frame, sizeof(*frame)))
+ return -EFAULT;
+
+ /* Create the ucontext. */
+ unsafe_put_user(uc_flags, &frame->uc.uc_flags, Efault);
+ unsafe_put_user(0, &frame->uc.uc_link, Efault);
+ unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
+ unsafe_put_user(0, &frame->uc.uc__pad0, Efault);
+ restorer = ksig->ka.sa.sa_restorer;
+ unsafe_put_user(restorer, (unsigned long __user *)&frame->pretcode, Efault);
+ unsafe_put_sigcontext(&frame->uc.uc_mcontext, fp, regs, set, Efault);
+ unsafe_put_sigmask(set, frame, Efault);
+ user_access_end();
+
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
+ return -EFAULT;
+ }
+
+ /* Set up registers for signal handler */
+ regs->sp = (unsigned long) frame;
+ regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
+
+ /* We use the x32 calling convention here... */
+ regs->di = ksig->sig;
+ regs->si = (unsigned long) &frame->info;
+ regs->dx = (unsigned long) &frame->uc;
+
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+
+ return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
+{
+ struct pt_regs *regs = current_pt_regs();
+ struct rt_sigframe_x32 __user *frame;
+ sigset_t set;
+ unsigned long uc_flags;
+
+ frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
+
+ if (!access_ok(frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], (__u64 __user *)&frame->uc.uc_sigmask))
+ goto badframe;
+ if (__get_user(uc_flags, &frame->uc.uc_flags))
+ goto badframe;
+
+ set_current_blocked(&set);
+
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ goto badframe;
+
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+
+ return regs->ax;
+
+badframe:
+ signal_fault(regs, frame, "x32 rt_sigreturn");
+ return 0;
+}
+#endif /* CONFIG_X86_X32_ABI */
+
+#ifdef CONFIG_COMPAT
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ if (!act)
+ return;
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+#endif /* CONFIG_COMPAT */
+
+/*
+* If adding a new si_code, there is probably new data in
+* the siginfo. Make sure folks bumping the si_code
+* limits also have to look at this code. Make sure any
+* new fields are handled in copy_siginfo_to_user32()!
+*/
+static_assert(NSIGILL == 11);
+static_assert(NSIGFPE == 15);
+static_assert(NSIGSEGV == 9);
+static_assert(NSIGBUS == 5);
+static_assert(NSIGTRAP == 6);
+static_assert(NSIGCHLD == 6);
+static_assert(NSIGSYS == 2);
+
+/* This is part of the ABI and can never change in size: */
+static_assert(sizeof(siginfo_t) == 128);
+
+/* This is a part of the ABI and can never change in alignment */
+static_assert(__alignof__(siginfo_t) == 8);
+
+/*
+* The offsets of all the (unioned) si_fields are fixed
+* in the ABI, of course. Make sure none of them ever
+* move and are always at the beginning:
+*/
+static_assert(offsetof(siginfo_t, si_signo) == 0);
+static_assert(offsetof(siginfo_t, si_errno) == 4);
+static_assert(offsetof(siginfo_t, si_code) == 8);
+
+/*
+* Ensure that the size of each si_field never changes.
+* If it does, it is a sign that the
+* copy_siginfo_to_user32() code below needs to updated
+* along with the size in the CHECK_SI_SIZE().
+*
+* We repeat this check for both the generic and compat
+* siginfos.
+*
+* Note: it is OK for these to grow as long as the whole
+* structure stays within the padding size (checked
+* above).
+*/
+
+#define CHECK_SI_OFFSET(name) \
+ static_assert(offsetof(siginfo_t, _sifields) == \
+ offsetof(siginfo_t, _sifields.name))
+#define CHECK_SI_SIZE(name, size) \
+ static_assert(sizeof_field(siginfo_t, _sifields.name) == size)
+
+CHECK_SI_OFFSET(_kill);
+CHECK_SI_SIZE (_kill, 2*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+
+CHECK_SI_OFFSET(_timer);
+CHECK_SI_SIZE (_timer, 6*sizeof(int));
+static_assert(offsetof(siginfo_t, si_tid) == 0x10);
+static_assert(offsetof(siginfo_t, si_overrun) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_rt);
+CHECK_SI_SIZE (_rt, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_value) == 0x18);
+
+CHECK_SI_OFFSET(_sigchld);
+CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_pid) == 0x10);
+static_assert(offsetof(siginfo_t, si_uid) == 0x14);
+static_assert(offsetof(siginfo_t, si_status) == 0x18);
+static_assert(offsetof(siginfo_t, si_utime) == 0x20);
+static_assert(offsetof(siginfo_t, si_stime) == 0x28);
+
+#ifdef CONFIG_X86_X32_ABI
+/* no _sigchld_x32 in the generic siginfo_t */
+static_assert(sizeof_field(compat_siginfo_t, _sifields._sigchld_x32) ==
+ 7*sizeof(int));
+static_assert(offsetof(compat_siginfo_t, _sifields) ==
+ offsetof(compat_siginfo_t, _sifields._sigchld_x32));
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) == 0x18);
+static_assert(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) == 0x20);
+#endif
+
+CHECK_SI_OFFSET(_sigfault);
+CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
+static_assert(offsetof(siginfo_t, si_addr) == 0x10);
+
+static_assert(offsetof(siginfo_t, si_trapno) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18);
+
+static_assert(offsetof(siginfo_t, si_lower) == 0x20);
+static_assert(offsetof(siginfo_t, si_upper) == 0x28);
+
+static_assert(offsetof(siginfo_t, si_pkey) == 0x20);
+
+static_assert(offsetof(siginfo_t, si_perf_data) == 0x18);
+static_assert(offsetof(siginfo_t, si_perf_type) == 0x20);
+static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
+
+CHECK_SI_OFFSET(_sigpoll);
+CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_band) == 0x10);
+static_assert(offsetof(siginfo_t, si_fd) == 0x18);
+
+CHECK_SI_OFFSET(_sigsys);
+CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
+static_assert(offsetof(siginfo_t, si_call_addr) == 0x10);
+static_assert(offsetof(siginfo_t, si_syscall) == 0x18);
+static_assert(offsetof(siginfo_t, si_arch) == 0x1C);
+
+/* any new si_fields should be added here */
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
deleted file mode 100644
index 879ef8c72f5c..000000000000
--- a/arch/x86/kernel/signal_compat.c
+++ /dev/null
@@ -1,191 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/compat.h>
-#include <linux/uaccess.h>
-#include <linux/ptrace.h>
-
-/*
- * The compat_siginfo_t structure and handing code is very easy
- * to break in several ways. It must always be updated when new
- * updates are made to the main siginfo_t, and
- * copy_siginfo_to_user32() must be updated when the
- * (arch-independent) copy_siginfo_to_user() is updated.
- *
- * It is also easy to put a new member in the compat_siginfo_t
- * which has implicit alignment which can move internal structure
- * alignment around breaking the ABI. This can happen if you,
- * for instance, put a plain 64-bit value in there.
- */
-static inline void signal_compat_build_tests(void)
-{
- int _sifields_offset = offsetof(compat_siginfo_t, _sifields);
-
- /*
- * If adding a new si_code, there is probably new data in
- * the siginfo. Make sure folks bumping the si_code
- * limits also have to look at this code. Make sure any
- * new fields are handled in copy_siginfo_to_user32()!
- */
- BUILD_BUG_ON(NSIGILL != 11);
- BUILD_BUG_ON(NSIGFPE != 15);
- BUILD_BUG_ON(NSIGSEGV != 9);
- BUILD_BUG_ON(NSIGBUS != 5);
- BUILD_BUG_ON(NSIGTRAP != 6);
- BUILD_BUG_ON(NSIGCHLD != 6);
- BUILD_BUG_ON(NSIGSYS != 2);
-
- /* This is part of the ABI and can never change in size: */
- BUILD_BUG_ON(sizeof(siginfo_t) != 128);
- BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
-
- /* This is a part of the ABI and can never change in alignment */
- BUILD_BUG_ON(__alignof__(siginfo_t) != 8);
- BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4);
-
- /*
- * The offsets of all the (unioned) si_fields are fixed
- * in the ABI, of course. Make sure none of them ever
- * move and are always at the beginning:
- */
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int));
-#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name))
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0);
- BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4);
- BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8);
-
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8);
- /*
- * Ensure that the size of each si_field never changes.
- * If it does, it is a sign that the
- * copy_siginfo_to_user32() code below needs to updated
- * along with the size in the CHECK_SI_SIZE().
- *
- * We repeat this check for both the generic and compat
- * siginfos.
- *
- * Note: it is OK for these to grow as long as the whole
- * structure stays within the padding size (checked
- * above).
- */
-#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name))
-#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name))
-
- CHECK_CSI_OFFSET(_kill);
- CHECK_CSI_SIZE (_kill, 2*sizeof(int));
- CHECK_SI_SIZE (_kill, 2*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
-
- CHECK_CSI_OFFSET(_timer);
- CHECK_CSI_SIZE (_timer, 3*sizeof(int));
- CHECK_SI_SIZE (_timer, 6*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
-
- CHECK_CSI_OFFSET(_rt);
- CHECK_CSI_SIZE (_rt, 3*sizeof(int));
- CHECK_SI_SIZE (_rt, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14);
-
- CHECK_CSI_OFFSET(_sigchld);
- CHECK_CSI_SIZE (_sigchld, 5*sizeof(int));
- CHECK_SI_SIZE (_sigchld, 8*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14);
- BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C);
-
-#ifdef CONFIG_X86_X32_ABI
- CHECK_CSI_OFFSET(_sigchld_x32);
- CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int));
- /* no _sigchld_x32 in the generic siginfo_t */
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20);
-#endif
-
- CHECK_CSI_OFFSET(_sigfault);
- CHECK_CSI_SIZE (_sigfault, 4*sizeof(int));
- CHECK_SI_SIZE (_sigfault, 8*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_trapno) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_trapno) != 0x10);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_data) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_type) != 0x20);
- BUILD_BUG_ON(offsetof(siginfo_t, si_perf_flags) != 0x24);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_data) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_type) != 0x14);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf_flags) != 0x18);
-
- CHECK_CSI_OFFSET(_sigpoll);
- CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
- CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10);
-
- CHECK_CSI_OFFSET(_sigsys);
- CHECK_CSI_SIZE (_sigsys, 3*sizeof(int));
- CHECK_SI_SIZE (_sigsys, 4*sizeof(int));
-
- BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10);
- BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18);
- BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10);
- BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14);
-
- /* any new si_fields should be added here */
-}
-
-void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
-{
- signal_compat_build_tests();
-
- if (!act)
- return;
-
- if (in_ia32_syscall())
- act->sa.sa_flags |= SA_IA32_ABI;
- if (in_x32_syscall())
- act->sa.sa_flags |= SA_X32_ABI;
-}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 06db901fabe8..375b33ecafa2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -32,7 +32,7 @@
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
-#include <asm/virtext.h>
+#include <asm/reboot.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
ack_APIC_irq();
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 3f3ea0287f69..352f0ce1ece4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,8 +56,10 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
+#include <linux/stackprotector.h>
#include <asm/acpi.h>
+#include <asm/cacheinfo.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -119,17 +121,20 @@ int arch_update_cpu_topology(void)
return retval;
}
+
+static unsigned int smpboot_warm_reset_vector_count;
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0xa, 0xf);
+ if (!smpboot_warm_reset_vector_count++) {
+ CMOS_WRITE(0xa, 0xf);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
+ }
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
- start_eip >> 4;
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
- start_eip & 0xf;
}
static inline void smpboot_restore_warm_reset_vector(void)
@@ -141,10 +146,12 @@ static inline void smpboot_restore_warm_reset_vector(void)
* to default values.
*/
spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0, 0xf);
+ if (!--smpboot_warm_reset_vector_count) {
+ CMOS_WRITE(0, 0xf);
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+ }
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
/*
@@ -1046,7 +1053,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
- per_cpu(current_task, cpu) = idle;
+ per_cpu(pcpu_hot.current_task, cpu) = idle;
cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
@@ -1056,9 +1063,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
- per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
-#else
- initial_gs = per_cpu_offset(cpu);
+ per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
#endif
return 0;
}
@@ -1084,9 +1089,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
start_ip = real_mode_header->trampoline_start64;
#endif
idle->thread.sp = (unsigned long)task_pt_regs(idle);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
- initial_stack = idle->thread.sp;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_stack = idle->thread.sp;
+ } else {
+ smpboot_control = cpu;
+ }
/* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
@@ -1428,8 +1438,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
- set_mtrr_aps_delayed_init();
-
smp_quirk_init_udelay();
speculative_store_bypass_ht_init();
@@ -1439,12 +1447,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
void arch_thaw_secondary_cpus_begin(void)
{
- set_mtrr_aps_delayed_init();
+ set_cache_aps_delayed_init(true);
}
void arch_thaw_secondary_cpus_end(void)
{
- mtrr_aps_init();
+ cache_aps_init();
}
/*
@@ -1453,7 +1461,11 @@ void arch_thaw_secondary_cpus_end(void)
void __init native_smp_prepare_boot_cpu(void)
{
int me = smp_processor_id();
- switch_to_new_gdt(me);
+
+ /* SMP handles this from setup_per_cpu_areas() */
+ if (!IS_ENABLED(CONFIG_SMP))
+ switch_gdt_and_percpu_base(me);
+
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
cpu_set_state_online(me);
@@ -1487,7 +1499,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
nmi_selftest();
impress_friends();
- mtrr_aps_init();
+ cache_aps_init();
}
static int __initdata setup_possible_cpus = -1;
@@ -1812,7 +1824,7 @@ static inline void mwait_play_dead(void)
}
}
-void hlt_play_dead(void)
+void __noreturn hlt_play_dead(void)
{
if (__this_cpu_read(cpu_info.x86) >= 4)
wbinvd();
@@ -1829,7 +1841,7 @@ void native_play_dead(void)
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
- mwait_play_dead(); /* Only returns on failure */
+ mwait_play_dead();
if (cpuidle_play_dead())
hlt_play_dead();
}
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index aaaba85d6d7f..b70670a98597 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -9,6 +9,7 @@ enum insn_type {
NOP = 1, /* site cond-call */
JMP = 2, /* tramp / site tail-call */
RET = 3, /* tramp / site cond-tail-call */
+ JCC = 4,
};
/*
@@ -25,15 +26,44 @@ static const u8 xor5rax[] = { 0x2e, 0x2e, 0x2e, 0x31, 0xc0 };
static const u8 retinsn[] = { RET_INSN_OPCODE, 0xcc, 0xcc, 0xcc, 0xcc };
+static u8 __is_Jcc(u8 *insn) /* Jcc.d32 */
+{
+ u8 ret = 0;
+
+ if (insn[0] == 0x0f) {
+ u8 tmp = insn[1];
+ if ((tmp & 0xf0) == 0x80)
+ ret = tmp;
+ }
+
+ return ret;
+}
+
+extern void __static_call_return(void);
+
+asm (".global __static_call_return\n\t"
+ ".type __static_call_return, @function\n\t"
+ ASM_FUNC_ALIGN "\n\t"
+ "__static_call_return:\n\t"
+ ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ "ret; int3\n\t"
+ ".size __static_call_return, . - __static_call_return \n\t");
+
static void __ref __static_call_transform(void *insn, enum insn_type type,
void *func, bool modinit)
{
const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
+ u8 op, buf[6];
+
+ if ((type == JMP || type == RET) && (op = __is_Jcc(insn)))
+ type = JCC;
switch (type) {
case CALL:
+ func = callthunks_translate_call_dest(func);
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
if (func == &__static_call_return0) {
emulate = code;
@@ -52,10 +82,24 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
case RET:
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
- code = text_gen_insn(JMP32_INSN_OPCODE, insn, &__x86_return_thunk);
+ code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk);
else
code = &retinsn;
break;
+
+ case JCC:
+ if (!func) {
+ func = __static_call_return;
+ if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
+ func = x86_return_thunk;
+ }
+
+ buf[0] = 0x0f;
+ __text_gen_insn(buf+1, op, insn+1, func, 5);
+ code = buf;
+ size = 6;
+
+ break;
}
if (memcmp(insn, code, size) == 0)
@@ -67,9 +111,9 @@ static void __ref __static_call_transform(void *insn, enum insn_type type,
text_poke_bp(insn, code, size, emulate);
}
-static void __static_call_validate(void *insn, bool tail, bool tramp)
+static void __static_call_validate(u8 *insn, bool tail, bool tramp)
{
- u8 opcode = *(u8 *)insn;
+ u8 opcode = insn[0];
if (tramp && memcmp(insn+5, tramp_ud, 3)) {
pr_err("trampoline signature fail");
@@ -78,7 +122,8 @@ static void __static_call_validate(void *insn, bool tail, bool tramp)
if (tail) {
if (opcode == JMP32_INSN_OPCODE ||
- opcode == RET_INSN_OPCODE)
+ opcode == RET_INSN_OPCODE ||
+ __is_Jcc(insn))
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 3c883e064242..3ffbab0081f4 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -12,6 +12,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
+#include <asm/gsseg.h>
#include "tls.h"
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 8617d1ed9d31..1b83377274b8 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -106,7 +106,7 @@ int arch_register_cpu(int num)
* Xen PV guests don't support CPU0 hotplug at all.
*/
if (c->x86_vendor != X86_VENDOR_INTEL ||
- boot_cpu_has(X86_FEATURE_XENPV))
+ cpu_feature_enabled(X86_FEATURE_XENPV))
cpu0_hotpluggable = 0;
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d3fdec706f1d..58b1f208eff5 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -40,7 +40,7 @@
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
-#include <linux/ioasid.h>
+#include <linux/iommu.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
@@ -68,13 +68,13 @@
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
-#include <asm/proto.h>
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-#include <asm/proto.h>
#endif
+#include <asm/proto.h>
+
DECLARE_BITMAP(system_vectors, NR_VECTORS);
static inline void cond_local_irq_enable(struct pt_regs *regs)
@@ -671,15 +671,15 @@ static bool try_fixup_enqcmd_gp(void)
if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
return false;
- pasid = current->mm->pasid;
-
/*
* If the mm has not been allocated a
* PASID, the #GP can not be fixed up.
*/
- if (!pasid_valid(pasid))
+ if (!mm_valid_pasid(current->mm))
return false;
+ pasid = current->mm->pasid;
+
/*
* Did this thread already have its PASID activated?
* If so, the #GP must be from something else.
@@ -858,7 +858,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
*/
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+ struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
@@ -876,7 +876,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* trust it and switch to the current kernel stack
*/
if (ip_within_syscall_gap(regs)) {
- sp = this_cpu_read(cpu_current_top_of_stack);
+ sp = this_cpu_read(pcpu_hot.top_of_stack);
goto sync;
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cafacb2e58cc..344698852146 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -48,10 +48,12 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
+static int __read_mostly tsc_force_recalibrate;
+
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
-struct clocksource *art_related_clocksource;
+static struct clocksource *art_related_clocksource;
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
@@ -215,7 +217,7 @@ static void __init cyc2ns_init_secondary_cpus(void)
/*
* Scheduler clock - returns current time in nanosec units.
*/
-u64 native_sched_clock(void)
+noinstr u64 native_sched_clock(void)
{
if (static_branch_likely(&__use_tsc)) {
u64 tsc_now = rdtsc();
@@ -248,7 +250,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
/* We need to define a real function for sched_clock, to override the
weak default version */
#ifdef CONFIG_PARAVIRT
-unsigned long long sched_clock(void)
+noinstr u64 sched_clock(void)
{
return paravirt_sched_clock();
}
@@ -258,8 +260,7 @@ bool using_native_sched_clock(void)
return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
-unsigned long long
-sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
bool using_native_sched_clock(void) { return true; }
#endif
@@ -292,6 +293,7 @@ __setup("notsc", notsc_setup);
static int no_sched_irq_time;
static int no_tsc_watchdog;
+static int tsc_as_watchdog;
static int __init tsc_setup(char *str)
{
@@ -301,8 +303,22 @@ static int __init tsc_setup(char *str)
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
- if (!strcmp(str, "nowatchdog"))
+ if (!strcmp(str, "nowatchdog")) {
no_tsc_watchdog = 1;
+ if (tsc_as_watchdog)
+ pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
+ __func__);
+ tsc_as_watchdog = 0;
+ }
+ if (!strcmp(str, "recalibrate"))
+ tsc_force_recalibrate = 1;
+ if (!strcmp(str, "watchdog")) {
+ if (no_tsc_watchdog)
+ pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
+ __func__);
+ else
+ tsc_as_watchdog = 1;
+ }
return 1;
}
@@ -912,8 +928,7 @@ void recalibrate_cpu_khz(void)
cpu_khz_old, cpu_khz);
#endif
}
-
-EXPORT_SYMBOL(recalibrate_cpu_khz);
+EXPORT_SYMBOL_GPL(recalibrate_cpu_khz);
static unsigned long long cyc2ns_suspend;
@@ -1186,6 +1201,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
+ tsc_as_watchdog && !no_tsc_watchdog;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1374,6 +1395,25 @@ restart:
else
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+ /* Will hit this only if tsc_force_recalibrate has been set */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+
+ /* Warn if the deviation exceeds 500 ppm */
+ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
+ pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
+ pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+ }
+
+ pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
+ hpet ? "HPET" : "PM_TIMER",
+ (unsigned long)freq / 1000,
+ (unsigned long)freq % 1000);
+
+ return;
+ }
+
/* Make sure we're within 1% */
if (abs(tsc_khz - freq) > tsc_khz/100)
goto out;
@@ -1407,8 +1447,10 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;
- if (tsc_unstable)
- goto unreg;
+ if (tsc_unstable) {
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1421,9 +1463,10 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
-unreg:
clocksource_unregister(&clocksource_tsc_early);
- return 0;
+
+ if (!tsc_force_recalibrate)
+ return 0;
}
schedule_delayed_work(&tsc_irqwork, 0);
@@ -1510,6 +1553,11 @@ void __init tsc_early_init(void)
void __init tsc_init(void)
{
+ if (!cpu_feature_enabled(X86_FEATURE_TSC)) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
+ return;
+ }
+
/*
* native_calibrate_cpu_early can only calibrate using methods that are
* available early in boot.
@@ -1517,11 +1565,6 @@ void __init tsc_init(void)
if (x86_platform.calibrate_cpu == native_calibrate_cpu_early)
x86_platform.calibrate_cpu = native_calibrate_cpu;
- if (!boot_cpu_has(X86_FEATURE_TSC)) {
- setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
- return;
- }
-
if (!tsc_khz) {
/* We failed to determine frequencies earlier, try again */
if (!determine_cpu_tsc_frequencies(false)) {
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index c059820dfaea..4d8e518365f4 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -7,6 +7,9 @@
#include <asm/unwind.h>
#include <asm/orc_types.h>
#include <asm/orc_lookup.h>
+#include <asm/orc_header.h>
+
+ORC_HEADER;
#define orc_warn(fmt, ...) \
printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
@@ -133,17 +136,31 @@ static struct orc_entry null_orc_entry = {
.sp_offset = sizeof(long),
.sp_reg = ORC_REG_SP,
.bp_reg = ORC_REG_UNDEFINED,
- .type = UNWIND_HINT_TYPE_CALL
+ .type = ORC_TYPE_CALL
};
+#ifdef CONFIG_CALL_THUNKS
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+ if (!is_callthunk((void *)ip))
+ return NULL;
+
+ return &null_orc_entry;
+}
+#else
+static struct orc_entry *orc_callthunk_find(unsigned long ip)
+{
+ return NULL;
+}
+#endif
+
/* Fake frame pointer entry -- used as a fallback for generated code */
static struct orc_entry orc_fp_entry = {
- .type = UNWIND_HINT_TYPE_CALL,
+ .type = ORC_TYPE_CALL,
.sp_reg = ORC_REG_BP,
.sp_offset = 16,
.bp_reg = ORC_REG_PREV_SP,
.bp_offset = -16,
- .end = 0,
};
static struct orc_entry *orc_find(unsigned long ip)
@@ -189,7 +206,11 @@ static struct orc_entry *orc_find(unsigned long ip)
if (orc)
return orc;
- return orc_ftrace_find(ip);
+ orc = orc_ftrace_find(ip);
+ if (orc)
+ return orc;
+
+ return orc_callthunk_find(ip);
}
#ifdef CONFIG_MODULES
@@ -231,13 +252,13 @@ static int orc_sort_cmp(const void *_a, const void *_b)
return -1;
/*
- * The "weak" section terminator entries need to always be on the left
+ * The "weak" section terminator entries need to always be first
* to ensure the lookup code skips them in favor of real entries.
* These terminator entries exist to handle any gaps created by
* whitelisted .o files which didn't get objtool generation.
*/
orc_a = cur_orc_table + (a - cur_orc_ip_table);
- return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+ return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
}
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
@@ -455,16 +476,16 @@ bool unwind_next_frame(struct unwind_state *state)
*/
orc = &orc_fp_entry;
state->error = true;
- }
-
- /* End-of-stack check for kernel threads: */
- if (orc->sp_reg == ORC_REG_UNDEFINED) {
- if (!orc->end)
+ } else {
+ if (orc->type == ORC_TYPE_UNDEFINED)
goto err;
- goto the_end;
+ if (orc->type == ORC_TYPE_END_OF_STACK)
+ goto the_end;
}
+ state->signal = orc->signal;
+
/* Find the previous frame's stack: */
switch (orc->sp_reg) {
case ORC_REG_SP:
@@ -533,7 +554,7 @@ bool unwind_next_frame(struct unwind_state *state)
/* Find IP, SP and possibly regs: */
switch (orc->type) {
- case UNWIND_HINT_TYPE_CALL:
+ case ORC_TYPE_CALL:
ip_p = sp - sizeof(long);
if (!deref_stack_reg(state, ip_p, &state->ip))
@@ -544,10 +565,9 @@ bool unwind_next_frame(struct unwind_state *state)
state->sp = sp;
state->regs = NULL;
state->prev_regs = NULL;
- state->signal = false;
break;
- case UNWIND_HINT_TYPE_REGS:
+ case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn_current("can't access registers at %pB\n",
(void *)orig_ip);
@@ -568,16 +588,15 @@ bool unwind_next_frame(struct unwind_state *state)
state->regs = (struct pt_regs *)sp;
state->prev_regs = NULL;
state->full_regs = true;
- state->signal = true;
break;
- case UNWIND_HINT_TYPE_REGS_PARTIAL:
+ case ORC_TYPE_REGS_PARTIAL:
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn_current("can't access iret registers at %pB\n",
(void *)orig_ip);
goto err;
}
- /* See UNWIND_HINT_TYPE_REGS case comment. */
+ /* See ORC_TYPE_REGS case comment. */
state->ip = unwind_recover_rethook(state, state->ip,
(unsigned long *)(state->sp - sizeof(long)));
@@ -585,7 +604,6 @@ bool unwind_next_frame(struct unwind_state *state)
state->prev_regs = state->regs;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
- state->signal = true;
break;
default:
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index b63cf8f7745e..6c07f6daaa22 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -722,8 +722,9 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
switch (opc1) {
case 0xeb: /* jmp 8 */
case 0xe9: /* jmp 32 */
- case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
break;
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ goto setup;
case 0xe8: /* call relative */
branch_clear_offset(auprobe, insn);
@@ -753,6 +754,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
+setup:
auprobe->branch.opc1 = opc1;
auprobe->branch.ilen = insn->length;
auprobe->branch.offs = insn->immediate.value;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 15f29053cec4..25f155205770 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -129,21 +129,21 @@ SECTIONS
HEAD_TEXT
TEXT_TEXT
SCHED_TEXT
- CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
- ALIGN_ENTRY_TEXT_BEGIN
- ENTRY_TEXT
- ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
- STATIC_CALL_TEXT
- *(.gnu.warning)
-
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.*)
__indirect_thunk_end = .;
#endif
+ STATIC_CALL_TEXT
+
+ ALIGN_ENTRY_TEXT_BEGIN
+ ENTRY_TEXT
+ ALIGN_ENTRY_TEXT_END
+ *(.gnu.warning)
+
} :text =0xcccc
/* End of text section, which should occupy whole number of pages */
@@ -290,6 +290,13 @@ SECTIONS
*(.return_sites)
__return_sites_end = .;
}
+
+ . = ALIGN(8);
+ .call_sites : AT(ADDR(.call_sites) - LOAD_OFFSET) {
+ __call_sites = .;
+ *(.call_sites)
+ __call_sites_end = .;
+ }
#endif
#ifdef CONFIG_X86_KERNEL_IBT
@@ -301,6 +308,15 @@ SECTIONS
}
#endif
+#ifdef CONFIG_FINEIBT
+ . = ALIGN(8);
+ .cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+ __cfi_sites = .;
+ *(.cfi_sites)
+ __cfi_sites_end = .;
+ }
+#endif
+
/*
* struct alt_inst entries. From the header (alternative.h):
* "Alternative instructions for different CPU types or capabilities"
@@ -493,11 +509,3 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif
#endif /* CONFIG_X86_64 */
-
-#ifdef CONFIG_KEXEC_CORE
-#include <asm/kexec.h>
-
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big");
-#endif
-
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 57353519bc11..d82f4fa2f1bf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -25,6 +25,7 @@
#include <asm/iommu.h>
#include <asm/mach_traps.h>
#include <asm/irqdomain.h>
+#include <asm/realmode.h>
void x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
@@ -32,8 +33,8 @@ static int __init iommu_init_noop(void) { return 0; }
static void iommu_shutdown_noop(void) { }
bool __init bool_x86_init_noop(void) { return false; }
void x86_op_int_noop(int cpu) { }
-static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
-static __init void get_rtc_noop(struct timespec64 *now) { }
+int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; }
+void get_rtc_noop(struct timespec64 *now) { }
static __initconst const struct of_device_id of_cmos_match[] = {
{ .compatible = "motorola,mc146818" },
@@ -133,6 +134,7 @@ static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu_early,
@@ -145,7 +147,10 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.get_nmi_reason = default_get_nmi_reason,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
+ .realmode_reserve = reserve_real_mode,
+ .realmode_init = init_real_mode,
.hyper.pin_vcpu = x86_op_int_noop,
+ .hyper.is_private_mmio = is_private_mmio_noop,
.guest = {
.enc_status_change_prepare = enc_status_change_prepare_noop,
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 67be7f217e37..89ca7f4c1464 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -46,9 +46,9 @@ config KVM
select KVM_XFER_TO_GUEST_WORK
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
- select SRCU
select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -118,6 +118,17 @@ config KVM_AMD_SEV
Provides support for launching Encrypted VMs (SEV) and Encrypted VMs
with Encrypted State (SEV-ES) on AMD processors.
+config KVM_SMM
+ bool "System Management Mode emulation"
+ default y
+ depends on KVM
+ help
+ Provides support for KVM to emulate System Management Mode (SMM)
+ in virtual machines. This can be used by the virtual machine
+ firmware to implement UEFI secure boot.
+
+ If unsure, say Y.
+
config KVM_XEN
bool "Support for Xen hypercall interface"
depends on KVM
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f453a0f96e24..80e3fe184d17 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -20,12 +20,14 @@ endif
kvm-$(CONFIG_X86_64) += mmu/tdp_iter.o mmu/tdp_mmu.o
kvm-$(CONFIG_KVM_XEN) += xen.o
+kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/evmcs.o vmx/nested.o vmx/posted_intr.o
+ vmx/hyperv.o vmx/nested.o vmx/posted_intr.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o svm/sev.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
+ svm/sev.o svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-amd-y += svm/svm_onhyperv.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 62bc7a01cecc..0c9660a07b23 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -8,6 +8,7 @@
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
* Copyright IBM Corporation, 2008
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/export.h>
@@ -25,6 +26,7 @@
#include "mmu.h"
#include "trace.h"
#include "pmu.h"
+#include "xen.h"
/*
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
@@ -58,14 +60,14 @@ u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
-/*
- * This one is tied to SSB in the user API, and not
- * visible in /proc/cpuinfo.
- */
-#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
-
#define F feature_bit
-#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
+/* Scattered Flag - For features that are scattered by cpufeatures.h. */
+#define SF(name) \
+({ \
+ BUILD_BUG_ON(X86_FEATURE_##name >= MAX_CPU_FEATURES); \
+ (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0); \
+})
/*
* Magic value used by KVM when querying userspace-provided CPUID entries and
@@ -174,15 +176,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
return 0;
}
-static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
{
- u32 function;
+ struct kvm_hypervisor_cpuid cpuid = {};
struct kvm_cpuid_entry2 *entry;
+ u32 base;
- vcpu->arch.kvm_cpuid_base = 0;
-
- for_each_possible_hypervisor_cpuid_base(function) {
- entry = kvm_find_cpuid_entry(vcpu, function);
+ for_each_possible_hypervisor_cpuid_base(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
if (entry) {
u32 signature[3];
@@ -191,19 +193,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
signature[1] = entry->ecx;
signature[2] = entry->edx;
- BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
- if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
- vcpu->arch.kvm_cpuid_base = function;
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
break;
}
}
}
+
+ return cpuid;
}
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{
- u32 base = vcpu->arch.kvm_cpuid_base;
+ u32 base = vcpu->arch.kvm_cpuid.base;
if (!base)
return NULL;
@@ -249,14 +253,13 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
int nent)
{
struct kvm_cpuid_entry2 *best;
- u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
best = cpuid_entry2_find(entries, nent, 1, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
- kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE));
cpuid_entry_change(best, X86_FEATURE_APIC,
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
@@ -265,7 +268,7 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE));
best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
@@ -288,21 +291,6 @@ static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_e
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
-
- /*
- * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
- * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
- * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
- * at the time of EENTER, thus adjust the allowed XFRM by the guest's
- * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
- * '1' even on CPUs that don't support XSAVE.
- */
- best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
- if (best) {
- best->ecx &= guest_supported_xcr0 & 0xffffffff;
- best->edx &= guest_supported_xcr0 >> 32;
- best->ecx |= XFEATURE_MASK_FPSSE;
- }
}
void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
@@ -410,7 +398,7 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
* KVM_SET_CPUID{,2} again. To support this legacy behavior, check
* whether the supplied CPUID data is equal to what's already set.
*/
- if (vcpu->arch.last_vmentry_cpu != -1) {
+ if (kvm_vcpu_has_run(vcpu)) {
r = kvm_cpuid_check_equal(vcpu, e2, nent);
if (r)
return r;
@@ -433,7 +421,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_entries = e2;
vcpu->arch.cpuid_nent = nent;
- kvm_update_kvm_cpuid_base(vcpu);
+ vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -543,9 +532,9 @@ static __always_inline void __kvm_cpu_cap_mask(unsigned int leaf)
}
static __always_inline
-void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
+void kvm_cpu_cap_init_kvm_defined(enum kvm_only_cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_mask for non-scattered leafs. */
+ /* Use kvm_cpu_cap_mask for leafs that aren't KVM-only. */
BUILD_BUG_ON(leaf < NCAPINTS);
kvm_cpu_caps[leaf] = mask;
@@ -555,7 +544,7 @@ void kvm_cpu_cap_init_scattered(enum kvm_only_cpuid_leafs leaf, u32 mask)
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
- /* Use kvm_cpu_cap_init_scattered for scattered leafs. */
+ /* Use kvm_cpu_cap_init_kvm_defined for KVM-only leafs. */
BUILD_BUG_ON(leaf >= NCAPINTS);
kvm_cpu_caps[leaf] &= mask;
@@ -642,7 +631,7 @@ void kvm_set_cpu_caps(void)
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16) |
- F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16)
+ F(AMX_TILE) | F(AMX_INT8) | F(AMX_BF16) | F(FLUSH_L1D)
);
/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
@@ -657,15 +646,21 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX_VNNI) | F(AVX512_BF16)
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
+ F(FZRM) | F(FSRS) | F(FSRC) |
+ F(AMX_FP16) | F(AVX_IFMA)
+ );
+
+ kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
+ F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI)
);
kvm_cpu_cap_mask(CPUID_D_1_EAX,
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) | f_xfd
);
- kvm_cpu_cap_init_scattered(CPUID_12_EAX,
- SF(SGX1) | SF(SGX2)
+ kvm_cpu_cap_init_kvm_defined(CPUID_12_EAX,
+ SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
@@ -690,11 +685,15 @@ void kvm_set_cpu_caps(void)
if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX,
+ SF(CONSTANT_TSC)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
- __feature_bit(KVM_X86_FEATURE_PSFD)
+ F(AMD_PSFD)
);
/*
@@ -730,6 +729,27 @@ void kvm_set_cpu_caps(void)
0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
F(SME_COHERENT));
+ kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
+ F(NO_NESTED_DATA_BP) | F(LFENCE_RDTSC) | 0 /* SmmPgCfgLock */ |
+ F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */
+ );
+
+ /*
+ * Synthesize "LFENCE is serializing" into the AMD-defined entry in
+ * KVM's supported CPUID if the feature is reported as supported by the
+ * kernel. LFENCE_RDTSC was a Linux-defined synthetic feature long
+ * before AMD joined the bandwagon, e.g. LFENCE is serializing on most
+ * CPUs that support SSE2. On CPUs that don't support AMD's leaf,
+ * kvm_cpu_cap_mask() will unfortunately drop the flag due to ANDing
+ * the mask with the raw host CPUID, and reporting support in AMD's
+ * leaf can make it easier for userspace to detect the feature.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ kvm_cpu_cap_set(X86_FEATURE_LFENCE_RDTSC);
+ if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
+ kvm_cpu_cap_set(X86_FEATURE_NULL_SEL_CLR_BASE);
+ kvm_cpu_cap_set(X86_FEATURE_NO_SMM_CTL_MSR);
+
kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
@@ -759,16 +779,22 @@ struct kvm_cpuid_array {
int nent;
};
+static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array)
+{
+ if (array->nent >= array->maxnent)
+ return NULL;
+
+ return &array->entries[array->nent++];
+}
+
static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
u32 function, u32 index)
{
- struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *entry = get_next_cpuid(array);
- if (array->nent >= array->maxnent)
+ if (!entry)
return NULL;
- entry = &array->entries[array->nent++];
-
memset(entry, 0, sizeof(*entry));
entry->function = function;
entry->index = index;
@@ -913,9 +939,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
goto out;
cpuid_entry_override(entry, CPUID_7_1_EAX);
+ cpuid_entry_override(entry, CPUID_7_1_EDX);
entry->ebx = 0;
entry->ecx = 0;
- entry->edx = 0;
}
break;
case 0xa: { /* Architectural Performance Monitoring */
@@ -945,25 +971,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = edx.full;
break;
}
- /*
- * Per Intel's SDM, the 0x1f is a superset of 0xb,
- * thus they can be handled by common code.
- */
case 0x1f:
case 0xb:
/*
- * Populate entries until the level type (ECX[15:8]) of the
- * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is
- * the starting entry, filled by the primary do_host_cpuid().
+ * No topology; a valid topology is indicated by the presence
+ * of subleaf 1.
*/
- for (i = 1; entry->ecx & 0xff00; ++i) {
- entry = do_host_cpuid(array, function, i);
- if (!entry)
- goto out;
- }
+ entry->eax = entry->ebx = entry->ecx = 0;
break;
case 0xd: {
- u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xcr0 = kvm_get_filtered_xcr0();
u64 permitted_xss = kvm_caps.supported_xss;
entry->eax &= permitted_xcr0;
@@ -1047,9 +1064,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* userspace. ATTRIBUTES.XFRM is not adjusted as userspace is
* expected to derive it from supported XCR0.
*/
- entry->eax &= SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT |
- SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY |
- SGX_ATTR_KSS;
+ entry->eax &= SGX_ATTR_PRIV_MASK | SGX_ATTR_UNPRIV_MASK;
entry->ebx &= 0;
break;
/* Intel PT */
@@ -1142,8 +1157,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx &= ~GENMASK(17, 16);
break;
case 0x80000007: /* Advanced power management */
- /* invariant TSC is CPUID.80000007H:EDX[8] */
- entry->edx &= (1 << 8);
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
/* mask against host */
entry->edx &= boot_cpu_data.x86_power;
entry->eax = entry->ebx = entry->ecx = 0;
@@ -1193,6 +1208,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->ebx = entry->ecx = entry->edx = 0;
break;
case 0x8000001e:
+ /* Do not return host topology information. */
+ entry->eax = entry->ebx = entry->ecx = 0;
+ entry->edx = 0; /* reserved */
break;
case 0x8000001F:
if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) {
@@ -1213,21 +1231,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
case 0x80000021:
entry->ebx = entry->ecx = entry->edx = 0;
- /*
- * Pass down these bits:
- * EAX 0 NNDBP, Processor ignores nested data breakpoints
- * EAX 2 LAS, LFENCE always serializing
- * EAX 6 NSCB, Null selector clear base
- *
- * Other defined bits are for MSRs that KVM does not expose:
- * EAX 3 SPCL, SMM page configuration lock
- * EAX 13 PCMSR, Prefetch control MSR
- */
- entry->eax &= BIT(0) | BIT(2) | BIT(6);
- if (static_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- entry->eax |= BIT(2);
- if (!static_cpu_has_bug(X86_BUG_NULL_SEG))
- entry->eax |= BIT(6);
+ cpuid_entry_override(entry, CPUID_8000_0021_EAX);
break;
/*Add support for Centaur's CPUID instruction*/
case 0xC0000000:
@@ -1469,6 +1473,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
*ebx &= ~(F(RTM) | F(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~SF(CONSTANT_TSC);
}
} else {
*eax = *ebx = *ecx = *edx = 0;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c1390357126a..ee8c4c3496ed 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -4,6 +4,8 @@
*
* Copyright 2016 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a43261d25a2..936a397a08cd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -17,6 +17,7 @@
*
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
@@ -242,37 +243,6 @@ enum x86_transfer_type {
X86_TRANSFER_TASK_SWITCH,
};
-static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- if (!(ctxt->regs_valid & (1 << nr))) {
- ctxt->regs_valid |= 1 << nr;
- ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
- }
- return ctxt->_regs[nr];
-}
-
-static ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
- nr &= NR_EMULATOR_GPRS - 1;
-
- BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
- BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
-
- ctxt->regs_valid |= 1 << nr;
- ctxt->regs_dirty |= 1 << nr;
- return &ctxt->_regs[nr];
-}
-
-static ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
-{
- reg_read(ctxt, nr);
- return reg_write(ctxt, nr);
-}
-
static void writeback_registers(struct x86_emulate_ctxt *ctxt)
{
unsigned long dirty = ctxt->regs_dirty;
@@ -1664,12 +1634,20 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_SS:
/*
* segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
+ * selector's RPL != CPL or DPL != CPL
*/
if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
goto exception;
break;
case VCPU_SREG_CS:
+ /*
+ * KVM uses "none" when loading CS as part of emulating Real
+ * Mode exceptions and IRET (handled above). In all other
+ * cases, loading CS without a control transfer is a KVM bug.
+ */
+ if (WARN_ON_ONCE(transfer == X86_TRANSFER_NONE))
+ goto exception;
+
if (!(seg_desc.type & 8))
goto exception;
@@ -1726,11 +1704,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/*
* segment is not a data or readable code segment or
* ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
+ * and ((RPL > DPL) or (CPL > DPL)))
*/
if ((seg_desc.type & 0xa) == 0x8 ||
(((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
+ (rpl > dpl || cpl > dpl)))
goto exception;
break;
}
@@ -2338,335 +2316,15 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
-{
-#ifdef CONFIG_X86_64
- return ctxt->ops->guest_has_long_mode(ctxt);
-#else
- return false;
-#endif
-}
-
-static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
-{
- desc->g = (flags >> 23) & 1;
- desc->d = (flags >> 22) & 1;
- desc->l = (flags >> 21) & 1;
- desc->avl = (flags >> 20) & 1;
- desc->p = (flags >> 15) & 1;
- desc->dpl = (flags >> 13) & 3;
- desc->s = (flags >> 12) & 1;
- desc->type = (flags >> 8) & 15;
-}
-
-static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
- else
- offset = 0x7f2c + (n - 3) * 12;
-
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
- return X86EMUL_CONTINUE;
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
- int n)
-{
- struct desc_struct desc;
- int offset;
- u16 selector;
- u32 base3;
-
- offset = 0x7e00 + n * 16;
-
- selector = GET_SMSTATE(u16, smstate, offset);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8));
- base3 = GET_SMSTATE(u32, smstate, offset + 12);
-
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
- return X86EMUL_CONTINUE;
-}
-#endif
-
-static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
- u64 cr0, u64 cr3, u64 cr4)
-{
- int bad;
- u64 pcid;
-
- /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
- pcid = 0;
- if (cr4 & X86_CR4_PCIDE) {
- pcid = cr3 & 0xfff;
- cr3 &= ~0xfff;
- }
-
- bad = ctxt->ops->set_cr(ctxt, 3, cr3);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- /*
- * First enable PAE, long mode needs it before CR0.PG = 1 is set.
- * Then enable protected mode. However, PCID cannot be enabled
- * if EFER.LMA=0, so set it separately.
- */
- bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- bad = ctxt->ops->set_cr(ctxt, 0, cr0);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
-
- if (cr4 & X86_CR4_PCIDE) {
- bad = ctxt->ops->set_cr(ctxt, 4, cr4);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- if (pcid) {
- bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
- if (bad)
- return X86EMUL_UNHANDLEABLE;
- }
-
- }
-
- return X86EMUL_CONTINUE;
-}
-
-static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u16 selector;
- u32 val, cr0, cr3, cr4;
- int i;
-
- cr0 = GET_SMSTATE(u32, smstate, 0x7ffc);
- cr3 = GET_SMSTATE(u32, smstate, 0x7ff8);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED;
- ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0);
-
- for (i = 0; i < 8; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4);
-
- val = GET_SMSTATE(u32, smstate, 0x7fcc);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u32, smstate, 0x7fc8);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc4);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
-
- selector = GET_SMSTATE(u32, smstate, 0x7fc0);
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80));
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c));
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78));
- ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f74);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f70);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- dt.address = GET_SMSTATE(u32, smstate, 0x7f58);
- dt.size = GET_SMSTATE(u32, smstate, 0x7f54);
- ctxt->ops->set_idt(ctxt, &dt);
-
- for (i = 0; i < 6; i++) {
- int r = rsm_load_seg_32(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- cr4 = GET_SMSTATE(u32, smstate, 0x7f14);
-
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8));
-
- return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
-}
-
-#ifdef CONFIG_X86_64
-static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
-{
- struct desc_struct desc;
- struct desc_ptr dt;
- u64 val, cr0, cr3, cr4;
- u32 base3;
- u16 selector;
- int i, r;
-
- for (i = 0; i < 16; i++)
- *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8);
-
- ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78);
- ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED;
-
- val = GET_SMSTATE(u64, smstate, 0x7f68);
-
- if (ctxt->ops->set_dr(ctxt, 6, val))
- return X86EMUL_UNHANDLEABLE;
-
- val = GET_SMSTATE(u64, smstate, 0x7f60);
-
- if (ctxt->ops->set_dr(ctxt, 7, val))
- return X86EMUL_UNHANDLEABLE;
-
- cr0 = GET_SMSTATE(u64, smstate, 0x7f58);
- cr3 = GET_SMSTATE(u64, smstate, 0x7f50);
- cr4 = GET_SMSTATE(u64, smstate, 0x7f48);
- ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00));
- val = GET_SMSTATE(u64, smstate, 0x7ed0);
-
- if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA))
- return X86EMUL_UNHANDLEABLE;
-
- selector = GET_SMSTATE(u32, smstate, 0x7e90);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98));
- base3 = GET_SMSTATE(u32, smstate, 0x7e9c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e84);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e88);
- ctxt->ops->set_idt(ctxt, &dt);
-
- selector = GET_SMSTATE(u32, smstate, 0x7e70);
- rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8);
- set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74));
- set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78));
- base3 = GET_SMSTATE(u32, smstate, 0x7e7c);
- ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
-
- dt.size = GET_SMSTATE(u32, smstate, 0x7e64);
- dt.address = GET_SMSTATE(u64, smstate, 0x7e68);
- ctxt->ops->set_gdt(ctxt, &dt);
-
- r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
- if (r != X86EMUL_CONTINUE)
- return r;
-
- for (i = 0; i < 6; i++) {
- r = rsm_load_seg_64(ctxt, smstate, i);
- if (r != X86EMUL_CONTINUE)
- return r;
- }
-
- return X86EMUL_CONTINUE;
-}
-#endif
-
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- unsigned long cr0, cr4, efer;
- char buf[512];
- u64 smbase;
- int ret;
-
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
+ if (!ctxt->ops->is_smm(ctxt))
return emulate_ud(ctxt);
- smbase = ctxt->ops->get_smbase(ctxt);
-
- ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf));
- if (ret != X86EMUL_CONTINUE)
- return X86EMUL_UNHANDLEABLE;
+ if (ctxt->ops->leave_smm(ctxt))
+ ctxt->ops->triple_fault(ctxt);
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
- ctxt->ops->set_nmi_mask(ctxt, false);
-
- ctxt->ops->exiting_smm(ctxt);
-
- /*
- * Get back to real mode, to prepare a safe state in which to load
- * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
- * supports long mode.
- */
- if (emulator_has_longmode(ctxt)) {
- struct desc_struct cs_desc;
-
- /* Zero CR4.PCIDE before CR0.PG. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PCIDE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
-
- /* A 32-bit code segment is required to clear EFER.LMA. */
- memset(&cs_desc, 0, sizeof(cs_desc));
- cs_desc.type = 0xb;
- cs_desc.s = cs_desc.g = cs_desc.p = 1;
- ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
- }
-
- /* For the 64-bit case, this will clear EFER.LMA. */
- cr0 = ctxt->ops->get_cr(ctxt, 0);
- if (cr0 & X86_CR0_PE)
- ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
-
- if (emulator_has_longmode(ctxt)) {
- /* Clear CR4.PAE before clearing EFER.LME. */
- cr4 = ctxt->ops->get_cr(ctxt, 4);
- if (cr4 & X86_CR4_PAE)
- ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
-
- /* And finally go back to 32-bit mode. */
- efer = 0;
- ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
- }
-
- /*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
- */
- if (ctxt->ops->leave_smm(ctxt, buf))
- goto emulate_shutdown;
-
-#ifdef CONFIG_X86_64
- if (emulator_has_longmode(ctxt))
- ret = rsm_load_state_64(ctxt, buf);
- else
-#endif
- ret = rsm_load_state_32(ctxt, buf);
-
- if (ret != X86EMUL_CONTINUE)
- goto emulate_shutdown;
-
- /*
- * Note, the ctxt->ops callbacks are responsible for handling side
- * effects when writing MSRs and CRs, e.g. MMU context resets, CPUID
- * runtime updates, etc... If that changes, e.g. this flow is moved
- * out of the emulator to make it look more like enter_smm(), then
- * those side effects need to be explicitly handled for both success
- * and shutdown.
- */
return emulator_recalc_and_set_mode(ctxt);
-
-emulate_shutdown:
- ctxt->ops->triple_fault(ctxt);
- return X86EMUL_CONTINUE;
}
static void
@@ -2966,8 +2624,8 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
return true;
}
-static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
- u16 port, u16 len)
+static bool emulator_io_permitted(struct x86_emulate_ctxt *ctxt,
+ u16 port, u16 len)
{
if (ctxt->perm_ok)
return true;
@@ -4312,7 +3970,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
static int check_perm_in(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
+ if (!emulator_io_permitted(ctxt, ctxt->src.val, ctxt->dst.bytes))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -4321,7 +3979,7 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
static int check_perm_out(struct x86_emulate_ctxt *ctxt)
{
ctxt->src.bytes = min(ctxt->src.bytes, 4u);
- if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
+ if (!emulator_io_permitted(ctxt, ctxt->dst.val, ctxt->src.bytes))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -5483,7 +5141,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
- unsigned emul_flags;
+ bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);
ctxt->mem_read.pos = 0;
@@ -5498,7 +5156,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- emul_flags = ctxt->ops->get_hflags(ctxt);
if (unlikely(ctxt->d &
(No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5532,7 +5189,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(&ctxt->dst);
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ if (unlikely(is_guest_mode) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5561,7 +5218,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5615,7 +5272,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 0adf4a437e85..b28fd020066f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -17,28 +17,50 @@
* Ben-Ami Yassour <benami@il.ibm.com>
* Andrey Smetanin <asmetanin@virtuozzo.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "lapic.h"
#include "ioapic.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "mmu.h"
#include "xen.h"
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/sched/cputime.h>
+#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <asm/apicdef.h>
+#include <asm/mshyperv.h>
#include <trace/events/kvm.h>
#include "trace.h"
#include "irq.h"
#include "fpu.h"
-#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
+#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value
+ * where each bit tells which extended hypercall is available besides
+ * HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick);
@@ -897,13 +919,15 @@ bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page)
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu)
{
- if (!kvm_hv_assist_page_enabled(vcpu))
- return false;
- return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
- assist_page, sizeof(*assist_page));
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu || !kvm_hv_assist_page_enabled(vcpu))
+ return -EFAULT;
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ &hv_vcpu->vp_assist_page, sizeof(struct hv_vp_assist_page));
}
EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page);
@@ -954,6 +978,11 @@ int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
hv_vcpu->vp_index = vcpu->vcpu_idx;
+ for (i = 0; i < HV_NR_TLB_FLUSH_FIFOS; i++) {
+ INIT_KFIFO(hv_vcpu->tlb_flush_fifo[i].entries);
+ spin_lock_init(&hv_vcpu->tlb_flush_fifo[i].write_lock);
+ }
+
return 0;
}
@@ -989,6 +1018,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
r = true;
@@ -1273,6 +1303,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
case HV_X64_MSR_TSC_EMULATION_STATUS:
return hv_vcpu->cpuid_cache.features_eax &
HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
return hv_vcpu->cpuid_cache.features_edx &
@@ -1400,12 +1433,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
if (!host)
return 1;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_set_msr(vcpu, msr, data, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
return 0;
@@ -1526,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
@@ -1575,11 +1617,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_TSC_EMULATION_STATUS:
data = hv->hv_tsc_emulation_status;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_get_msr(vcpu, msr, pdata, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
@@ -1644,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = APIC_BUS_FREQUENCY;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
*pdata = data;
@@ -1736,7 +1781,30 @@ static void sparse_set_to_vcpu_mask(struct kvm *kvm, u64 *sparse_banks,
}
}
+static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_banks[])
+{
+ int valid_bit_nr = vp_id / HV_VCPUS_PER_SPARSE_BANK;
+ unsigned long sbank;
+
+ if (!test_bit(valid_bit_nr, (unsigned long *)&valid_bank_mask))
+ return false;
+
+ /*
+ * The index into the sparse bank is the number of preceding bits in
+ * the valid mask. Optimize for VMs with <64 vCPUs by skipping the
+ * fancy math if there can't possibly be preceding bits.
+ */
+ if (valid_bit_nr)
+ sbank = hweight64(valid_bank_mask & GENMASK_ULL(valid_bit_nr - 1, 0));
+ else
+ sbank = 0;
+
+ return test_bit(vp_id % HV_VCPUS_PER_SPARSE_BANK,
+ (unsigned long *)&sparse_banks[sbank]);
+}
+
struct kvm_hv_hcall {
+ /* Hypercall input data */
u64 param;
u64 ingpa;
u64 outgpa;
@@ -1747,59 +1815,179 @@ struct kvm_hv_hcall {
bool fast;
bool rep;
sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS];
-};
-static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
- int consumed_xmm_halves,
- u64 *sparse_banks, gpa_t offset)
-{
- u16 var_cnt;
- int i;
+ /*
+ * Current read offset when KVM reads hypercall input data gradually,
+ * either offset in bytes from 'ingpa' for regular hypercalls or the
+ * number of already consumed 'XMM halves' for 'fast' hypercalls.
+ */
+ union {
+ gpa_t data_offset;
+ int consumed_xmm_halves;
+ };
+};
- if (hc->var_cnt > 64)
- return -EINVAL;
- /* Ignore banks that cannot possibly contain a legal VP index. */
- var_cnt = min_t(u16, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS);
+static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u16 orig_cnt, u16 cnt_cap, u64 *data)
+{
+ /*
+ * Preserve the original count when ignoring entries via a "cap", KVM
+ * still needs to validate the guest input (though the non-XMM path
+ * punts on the checks).
+ */
+ u16 cnt = min(orig_cnt, cnt_cap);
+ int i, j;
if (hc->fast) {
/*
* Each XMM holds two sparse banks, but do not count halves that
* have already been consumed for hypercall parameters.
*/
- if (hc->var_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves)
+ if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves)
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- for (i = 0; i < var_cnt; i++) {
- int j = i + consumed_xmm_halves;
+
+ for (i = 0; i < cnt; i++) {
+ j = i + hc->consumed_xmm_halves;
if (j % 2)
- sparse_banks[i] = sse128_hi(hc->xmm[j / 2]);
+ data[i] = sse128_hi(hc->xmm[j / 2]);
else
- sparse_banks[i] = sse128_lo(hc->xmm[j / 2]);
+ data[i] = sse128_lo(hc->xmm[j / 2]);
}
return 0;
}
- return kvm_read_guest(kvm, hc->ingpa + offset, sparse_banks,
- var_cnt * sizeof(*sparse_banks));
+ return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data,
+ cnt * sizeof(*data));
+}
+
+static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc,
+ u64 *sparse_banks)
+{
+ if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */
+ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS,
+ sparse_banks);
+}
+
+static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[])
+{
+ return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries);
+}
+
+static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo,
+ u64 *entries, int count)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 flush_all_entry = KVM_HV_TLB_FLUSHALL_ENTRY;
+
+ if (!hv_vcpu)
+ return;
+
+ spin_lock(&tlb_flush_fifo->write_lock);
+
+ /*
+ * All entries should fit on the fifo leaving one free for 'flush all'
+ * entry in case another request comes in. In case there's not enough
+ * space, just put 'flush all' entry there.
+ */
+ if (count && entries && count < kfifo_avail(&tlb_flush_fifo->entries)) {
+ WARN_ON(kfifo_in(&tlb_flush_fifo->entries, entries, count) != count);
+ goto out_unlock;
+ }
+
+ /*
+ * Note: full fifo always contains 'flush all' entry, no need to check the
+ * return value.
+ */
+ kfifo_in(&tlb_flush_fifo->entries, &flush_all_entry, 1);
+
+out_unlock:
+ spin_unlock(&tlb_flush_fifo->write_lock);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 entries[KVM_HV_TLB_FLUSH_FIFO_SIZE];
+ int i, j, count;
+ gva_t gva;
+
+ if (!tdp_enabled || !hv_vcpu)
+ return -EINVAL;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ count = kfifo_out(&tlb_flush_fifo->entries, entries, KVM_HV_TLB_FLUSH_FIFO_SIZE);
+
+ for (i = 0; i < count; i++) {
+ if (entries[i] == KVM_HV_TLB_FLUSHALL_ENTRY)
+ goto out_flush_all;
+
+ /*
+ * Lower 12 bits of 'address' encode the number of additional
+ * pages to flush.
+ */
+ gva = entries[i] & PAGE_MASK;
+ for (j = 0; j < (entries[i] & ~PAGE_MASK) + 1; j++)
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, gva + j * PAGE_SIZE);
+
+ ++vcpu->stat.tlb_flush;
+ }
+ return 0;
+
+out_flush_all:
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+
+ /* Fall back to full flush. */
+ return -ENOSPC;
}
static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+ /*
+ * Normally, there can be no more than 'KVM_HV_TLB_FLUSH_FIFO_SIZE'
+ * entries on the TLB flush fifo. The last entry, however, needs to be
+ * always left free for 'flush all' entry which gets placed when
+ * there is not enough space to put all the requested entries.
+ */
+ u64 __tlb_flush_entries[KVM_HV_TLB_FLUSH_FIFO_SIZE - 1];
+ u64 *tlb_flush_entries;
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
+ struct kvm_vcpu *v;
+ unsigned long i;
bool all_cpus;
/*
- * The Hyper-V TLFS doesn't allow more than 64 sparse banks, e.g. the
- * valid mask is a u64. Fail the build if KVM's max allowed number of
- * vCPUs (>4096) would exceed this limit, KVM will additional changes
- * for Hyper-V support to avoid setting the guest up to fail.
+ * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS
+ * sparse banks. Fail the build if KVM's max allowed number of
+ * vCPUs (>4096) exceeds this limit.
*/
- BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > 64);
+ BUILD_BUG_ON(KVM_HV_MAX_SPARSE_VCPU_SET_BITS > HV_MAX_SPARSE_VCPU_BANKS);
+
+ /*
+ * 'Slow' hypercall's first parameter is the address in guest's memory
+ * where hypercall parameters are placed. This is either a GPA or a
+ * nested GPA when KVM is handling the call from L2 ('direct' TLB
+ * flush). Translate the address here so the memory can be uniformly
+ * read with kvm_read_guest().
+ */
+ if (!hc->fast && is_guest_mode(vcpu)) {
+ hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ if (unlikely(hc->ingpa == INVALID_GPA))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE) {
@@ -1807,14 +1995,17 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush.address_space = hc->ingpa;
flush.flags = hc->outgpa;
flush.processor_mask = sse128_lo(hc->xmm[0]);
+ hc->consumed_xmm_halves = 1;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa,
&flush, sizeof(flush))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush);
}
trace_kvm_hv_flush_tlb(flush.processor_mask,
- flush.address_space, flush.flags);
+ flush.address_space, flush.flags,
+ is_guest_mode(vcpu));
valid_bank_mask = BIT_ULL(0);
sparse_banks[0] = flush.processor_mask;
@@ -1834,16 +2025,18 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
flush_ex.flags = hc->outgpa;
memcpy(&flush_ex.hv_vp_set,
&hc->xmm[0], sizeof(hc->xmm[0]));
+ hc->consumed_xmm_halves = 2;
} else {
if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex,
sizeof(flush_ex))))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ hc->data_offset = sizeof(flush_ex);
}
trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask,
flush_ex.hv_vp_set.format,
flush_ex.address_space,
- flush_ex.flags);
+ flush_ex.flags, is_guest_mode(vcpu));
valid_bank_mask = flush_ex.hv_vp_set.valid_bank_mask;
all_cpus = flush_ex.hv_vp_set.format !=
@@ -1852,29 +2045,95 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (hc->var_cnt != hweight64(valid_bank_mask))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus)
- goto do_flush;
+ if (!all_cpus) {
+ if (!hc->var_cnt)
+ goto ret_success;
- if (!hc->var_cnt)
- goto ret_success;
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
+ return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ }
+
+ /*
+ * Hyper-V TLFS doesn't explicitly forbid non-empty sparse vCPU
+ * banks (and, thus, non-zero 'var_cnt') for the 'all vCPUs'
+ * case (HV_GENERIC_SET_ALL). Always adjust data_offset and
+ * consumed_xmm_halves to make sure TLB flush entries are read
+ * from the correct offset.
+ */
+ if (hc->fast)
+ hc->consumed_xmm_halves += hc->var_cnt;
+ else
+ hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]);
+ }
- if (kvm_get_sparse_vp_set(kvm, hc, 2, sparse_banks,
- offsetof(struct hv_tlb_flush_ex,
- hv_vp_set.bank_contents)))
+ if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) {
+ tlb_flush_entries = NULL;
+ } else {
+ if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
+ tlb_flush_entries = __tlb_flush_entries;
}
-do_flush:
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- if (all_cpus) {
- kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
- } else {
+ if (all_cpus && !is_guest_mode(vcpu)) {
+ kvm_for_each_vcpu(i, v, kvm) {
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_HV_TLB_FLUSH);
+ } else if (!is_guest_mode(vcpu)) {
sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, vcpu_mask);
+ for_each_set_bit(i, vcpu_mask, KVM_MAX_VCPUS) {
+ v = kvm_get_vcpu(kvm, i);
+ if (!v)
+ continue;
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, false);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
+ } else {
+ struct kvm_vcpu_hv *hv_v;
+
+ bitmap_zero(vcpu_mask, KVM_MAX_VCPUS);
+
+ kvm_for_each_vcpu(i, v, kvm) {
+ hv_v = to_hv_vcpu(v);
+
+ /*
+ * The following check races with nested vCPUs entering/exiting
+ * and/or migrating between L1's vCPUs, however the only case when
+ * KVM *must* flush the TLB is when the target L2 vCPU keeps
+ * running on the same L1 vCPU from the moment of the request until
+ * kvm_hv_flush_tlb() returns. TLB is fully flushed in all other
+ * cases, e.g. when the target L2 vCPU migrates to a different L1
+ * vCPU or when the corresponding L1 vCPU temporary switches to a
+ * different L2 vCPU while the request is being processed.
+ */
+ if (!hv_v || hv_v->nested.vm_id != hv_vcpu->nested.vm_id)
+ continue;
+
+ if (!all_cpus &&
+ !hv_is_vp_in_sparse_set(hv_v->nested.vp_id, valid_bank_mask,
+ sparse_banks))
+ continue;
+
+ __set_bit(i, vcpu_mask);
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(v, true);
+ hv_tlb_flush_enqueue(v, tlb_flush_fifo,
+ tlb_flush_entries, hc->rep_cnt);
+ }
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask);
}
ret_success:
@@ -1883,8 +2142,8 @@ ret_success:
((u64)hc->rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET);
}
-static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
- unsigned long *vcpu_bitmap)
+static void kvm_hv_send_ipi_to_many(struct kvm *kvm, u32 vector,
+ u64 *sparse_banks, u64 valid_bank_mask)
{
struct kvm_lapic_irq irq = {
.delivery_mode = APIC_DM_FIXED,
@@ -1894,7 +2153,9 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
unsigned long i;
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu_bitmap && !test_bit(i, vcpu_bitmap))
+ if (sparse_banks &&
+ !hv_is_vp_in_sparse_set(kvm_hv_get_vpindex(vcpu),
+ valid_bank_mask, sparse_banks))
continue;
/* We fail only when APIC is disabled */
@@ -1904,12 +2165,12 @@ static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector,
static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u64 *sparse_banks = hv_vcpu->sparse_banks;
struct kvm *kvm = vcpu->kvm;
struct hv_send_ipi_ex send_ipi_ex;
struct hv_send_ipi send_ipi;
- DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS);
u64 valid_bank_mask;
- u64 sparse_banks[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
u32 vector;
bool all_cpus;
@@ -1959,9 +2220,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
if (!hc->var_cnt)
goto ret_success;
- if (kvm_get_sparse_vp_set(kvm, hc, 1, sparse_banks,
- offsetof(struct hv_send_ipi_ex,
- vp_set.bank_contents)))
+ if (!hc->fast)
+ hc->data_offset = offsetof(struct hv_send_ipi_ex,
+ vp_set.bank_contents);
+ else
+ hc->consumed_xmm_halves = 1;
+
+ if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
@@ -1969,13 +2234,10 @@ check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
- if (all_cpus) {
- kvm_send_ipi_to_many(kvm, vector, NULL);
- } else {
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, vcpu_mask);
-
- kvm_send_ipi_to_many(kvm, vector, vcpu_mask);
- }
+ if (all_cpus)
+ kvm_hv_send_ipi_to_many(kvm, vector, NULL, 0);
+ else
+ kvm_hv_send_ipi_to_many(kvm, vector, sparse_banks, valid_bank_mask);
ret_success:
return HV_STATUS_SUCCESS;
@@ -2062,10 +2324,25 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result)
{
+ u32 tlb_lock_count = 0;
+ int ret;
+
+ if (hv_result_success(result) && is_guest_mode(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu) &&
+ kvm_read_guest(vcpu->kvm, to_hv_vcpu(vcpu)->nested.pa_page_gpa,
+ &tlb_lock_count, sizeof(tlb_lock_count)))
+ result = HV_STATUS_INVALID_HYPERCALL_INPUT;
+
trace_kvm_hv_hypercall_done(result);
kvm_hv_hypercall_set_result(vcpu, result);
++vcpu->stat.hypercalls;
- return kvm_skip_emulated_instruction(vcpu);
+
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ if (tlb_lock_count)
+ kvm_x86_ops.nested_ops->hv_inject_synthetic_vmexit_post_tlb_flush(vcpu);
+
+ return ret;
}
static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
@@ -2178,6 +2455,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
case HVCALL_SEND_IPI:
return hv_vcpu->cpuid_cache.enlightenments_eax &
HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
default:
break;
}
@@ -2270,14 +2550,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -2336,15 +2609,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_OPERATION_DENIED;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
}
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
@@ -2352,6 +2624,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
hypercall_complete:
return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
}
void kvm_hv_init_vm(struct kvm *kvm)
@@ -2491,9 +2772,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
ent->ebx |= HV_POST_MESSAGES;
ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
@@ -2502,6 +2785,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->ebx |= HV_DEBUGGING;
ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE;
ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
+ ent->edx |= HV_FEATURE_EXT_GVA_RANGES_FLUSH;
/*
* Direct Synthetic timers only make sense with in-kernel
@@ -2545,6 +2829,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
+ ent->eax |= HV_X64_NESTED_DIRECT_FLUSH;
ent->eax |= HV_X64_NESTED_MSR_BITMAP;
ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
break;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 1030b1b50552..f83b8db72b11 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -22,6 +22,7 @@
#define __ARCH_X86_KVM_HYPERV_H__
#include <linux/kvm_host.h>
+#include "x86.h"
/* "Hv#1" signature */
#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
@@ -107,8 +108,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages);
void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu);
-bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu,
- struct hv_vp_assist_page *assist_page);
+int kvm_hv_get_assist_page(struct kvm_vcpu *vcpu);
static inline struct kvm_vcpu_hv_stimer *to_hv_stimer(struct kvm_vcpu *vcpu,
int timer_index)
@@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
HV_SYNIC_STIMER_COUNT);
}
+/*
+ * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8])
+ * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * If Hyper-V's invariant TSC control is not exposed to the guest,
+ * the invariant TSC CPUID flag is not suppressed, Windows guests were
+ * observed to be able to handle it correctly. Going forward, VMMs are
+ * encouraged to enable Hyper-V's invariant TSC control when invariant
+ * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
@@ -151,4 +178,64 @@ int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+static inline struct kvm_vcpu_hv_tlb_flush_fifo *kvm_hv_get_tlb_flush_fifo(struct kvm_vcpu *vcpu,
+ bool is_guest_mode)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ int i = is_guest_mode ? HV_L2_TLB_FLUSH_FIFO :
+ HV_L1_TLB_FLUSH_FIFO;
+
+ return &hv_vcpu->tlb_flush_fifo[i];
+}
+
+static inline void kvm_hv_vcpu_purge_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv_tlb_flush_fifo *tlb_flush_fifo;
+
+ if (!to_hv_vcpu(vcpu) || !kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ return;
+
+ tlb_flush_fifo = kvm_hv_get_tlb_flush_fifo(vcpu, is_guest_mode(vcpu));
+
+ kfifo_reset_out(&tlb_flush_fifo->entries);
+}
+
+static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ return hv_vcpu &&
+ (hv_vcpu->cpuid_cache.nested_eax & HV_X64_NESTED_DIRECT_FLUSH);
+}
+
+static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ u16 code;
+
+ if (!hv_vcpu)
+ return false;
+
+ code = is_64_bit_hypercall(vcpu) ? kvm_rcx_read(vcpu) :
+ kvm_rax_read(vcpu);
+
+ return (code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX ||
+ code == HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX);
+}
+
+static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu)
+{
+ if (!to_hv_vcpu(vcpu))
+ return 0;
+
+ if (!kvm_hv_assist_page_enabled(vcpu))
+ return 0;
+
+ return kvm_hv_get_assist_page(vcpu);
+}
+
+int kvm_hv_vcpu_flush_tlb(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e0a7a0e7a73c..cd57a517d04a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -30,7 +30,7 @@
* Based on QEMU and Xen.
*/
-#define pr_fmt(fmt) "pit: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
if (ps->period < min_period) {
pr_info_ratelimited(
- "kvm: requested %lld ns "
+ "requested %lld ns "
"i8254 timer period limited to %lld ns\n",
ps->period, min_period);
ps->period = min_period;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e1bb6218bb96..4756bcb5724f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,8 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -35,7 +37,7 @@
#include "trace.h"
#define pr_pic_unimpl(fmt, ...) \
- pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
static void pic_irq_request(struct kvm *kvm, int level);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 765943d7cfa5..995eb5054360 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -26,6 +26,7 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
* Based on Xen 3.1 code.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -367,9 +368,39 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
- if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
- && ioapic->irr & (1 << index))
- ioapic_service(ioapic, index, false);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ ioapic->irr & (1 << index) && !e->fields.mask && !e->fields.remote_irr) {
+ /*
+ * Pending status in irr may be outdated: the IRQ line may have
+ * already been deasserted by a device while the IRQ was masked.
+ * This occurs, for instance, if the interrupt is handled in a
+ * Linux guest as a oneshot interrupt (IRQF_ONESHOT). In this
+ * case the guest acknowledges the interrupt to the device in
+ * its threaded irq handler, i.e. after the EOI but before
+ * unmasking, so at the time of unmasking the IRQ line is
+ * already down but our pending irr bit is still set. In such
+ * cases, injecting this pending interrupt to the guest is
+ * buggy: the guest will receive an extra unwanted interrupt.
+ *
+ * So we need to check here if the IRQ is actually still pending.
+ * As we are generally not able to probe the IRQ line status
+ * directly, we do it through irqfd resampler. Namely, we clear
+ * the pending status and notify the resampler that this interrupt
+ * is done, without actually injecting it into the guest. If the
+ * IRQ line is actually already deasserted, we are done. If it is
+ * still asserted, a new interrupt will be shortly triggered
+ * through irqfd and injected into the guest.
+ *
+ * If, however, it's not possible to resample (no irqfd resampler
+ * registered for this irq), then unconditionally inject this
+ * pending interrupt into the guest, so the guest will not miss
+ * an interrupt, although may get an extra unwanted interrupt.
+ */
+ if (kvm_notify_irqfd_resampler(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index))
+ ioapic->irr &= ~(1 << index);
+ else
+ ioapic_service(ioapic, index, false);
+ }
if (e->fields.delivery_mode == APIC_DM_FIXED) {
struct kvm_lapic_irq irq;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index f371f1292ca3..b2c397dd2bc6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -7,6 +7,7 @@
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/export.h>
#include <linux/kvm_host.h>
@@ -31,7 +32,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
return r;
}
-EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
* check if there is a pending userspace external interrupt
@@ -150,7 +150,6 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_inject_timer_irqs(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
{
@@ -165,3 +164,8 @@ bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
return resample ? irqchip_kernel(kvm) : irqchip_in_kernel(kvm);
}
+
+bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm);
+}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 0687162c4f22..16d076a1b91a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -8,6 +8,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
if (irq_source_id >= BITS_PER_LONG) {
- printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ pr_warn("exhausted allocatable IRQ sources!\n");
irq_source_id = -EFAULT;
goto unlock;
}
@@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
irq_source_id >= BITS_PER_LONG) {
- printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ pr_err("IRQ source ID out of range!\n");
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
@@ -426,8 +427,9 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
if (irq.trig_mode &&
- kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- irq.dest_id, irq.dest_mode))
+ (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ irq.dest_id, irq.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, irq.vector)))
__set_bit(irq.vector, ioapic_handled_vectors);
}
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3febc342360c..75eae9c4998a 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -4,7 +4,7 @@
#include <linux/kvm_host.h>
-#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
+#define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP)
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE)
@@ -76,6 +76,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
}
/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/*
* The "raw" register helpers are only for cases where the full 64 bits of a
* register are read/written irrespective of current vCPU mode. In other words,
* odds are good you shouldn't be using the raw variants.
@@ -145,6 +157,14 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr0 & mask;
}
+static __always_inline bool kvm_is_cr0_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr0_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr0_bit));
+
+ return !!kvm_read_cr0_bits(vcpu, cr0_bit);
+}
+
static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, ~0UL);
@@ -159,6 +179,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static __always_inline bool kvm_is_cr4_bit_set(struct kvm_vcpu *vcpu,
+ unsigned long cr4_bit)
+{
+ BUILD_BUG_ON(!is_power_of_2(cr4_bit));
+
+ return !!kvm_read_cr4_bits(vcpu, cr4_bit);
+}
+
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
@@ -200,9 +228,4 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
-static inline bool is_smm(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.hflags & HF_SMM_MASK;
-}
-
#endif
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 89246446d6aa..ab65f3a47dfd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -117,16 +117,6 @@ struct x86_emulate_ops {
struct x86_exception *fault, bool system);
/*
- * read_phys: Read bytes of standard (non-emulated/special) memory.
- * Used for descriptor reading.
- * @addr: [IN ] Physical address from which to read.
- * @val: [OUT] Value read from memory.
- * @bytes: [IN ] Number of bytes to read from memory.
- */
- int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
- void *val, unsigned int bytes);
-
- /*
* write_std: Write bytes of standard (non-emulated/special) memory.
* Used for descriptor writing.
* @addr: [IN ] Linear address to which to write.
@@ -209,11 +199,8 @@ struct x86_emulate_ops {
int (*cpl)(struct x86_emulate_ctxt *ctxt);
void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
- u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
- void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
@@ -233,9 +220,9 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
- unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
- void (*exiting_smm)(struct x86_emulate_ctxt *ctxt);
- int (*leave_smm)(struct x86_emulate_ctxt *ctxt, const char *smstate);
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt);
+ int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
};
@@ -289,11 +276,6 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
};
-/* These match some of the HF_* flags defined in kvm_host.h */
-#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
-#define X86EMUL_SMM_MASK (1 << 6)
-#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
-
/*
* fastop functions are declared as taking a never-defined fastop parameter,
* so they can't be called from C directly.
@@ -526,4 +508,35 @@ void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
+static inline ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ if (!(ctxt->regs_valid & (1 << nr))) {
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->_regs[nr] = ctxt->ops->read_gpr(ctxt, nr);
+ }
+ return ctxt->_regs[nr];
+}
+
+static inline ulong *reg_write(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ if (KVM_EMULATOR_BUG_ON(nr >= NR_EMULATOR_GPRS, ctxt))
+ nr &= NR_EMULATOR_GPRS - 1;
+
+ BUILD_BUG_ON(sizeof(ctxt->regs_dirty) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+ BUILD_BUG_ON(sizeof(ctxt->regs_valid) * BITS_PER_BYTE < NR_EMULATOR_GPRS);
+
+ ctxt->regs_valid |= 1 << nr;
+ ctxt->regs_dirty |= 1 << nr;
+ return &ctxt->_regs[nr];
+}
+
+static inline ulong *reg_rmw(struct x86_emulate_ctxt *ctxt, unsigned nr)
+{
+ reg_read(ctxt, nr);
+ return reg_write(ctxt, nr);
+}
+
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index ee4f696a0782..ded0bd688c65 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mshyperv.h>
@@ -9,17 +10,22 @@
#include "hyperv.h"
#include "kvm_onhyperv.h"
+struct kvm_hv_tlb_range {
+ u64 start_gfn;
+ u64 pages;
+};
+
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
- struct kvm_tlb_range *range = data;
+ struct kvm_hv_tlb_range *range = data;
return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
range->pages);
}
static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
- struct kvm_tlb_range *range)
+ struct kvm_hv_tlb_range *range)
{
if (range)
return hyperv_flush_guest_mapping_range(root_tdp,
@@ -28,8 +34,8 @@ static inline int hv_remote_flush_root_tdp(hpa_t root_tdp,
return hyperv_flush_guest_mapping(root_tdp);
}
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static int __hv_flush_remote_tlbs_range(struct kvm *kvm,
+ struct kvm_hv_tlb_range *range)
{
struct kvm_arch *kvm_arch = &kvm->arch;
struct kvm_vcpu *vcpu;
@@ -85,19 +91,29 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
spin_unlock(&kvm_arch->hv_root_tdp_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb_with_range);
-int hv_remote_flush_tlb(struct kvm *kvm)
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn, gfn_t nr_pages)
+{
+ struct kvm_hv_tlb_range range = {
+ .start_gfn = start_gfn,
+ .pages = nr_pages,
+ };
+
+ return __hv_flush_remote_tlbs_range(kvm, &range);
+}
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs_range);
+
+int hv_flush_remote_tlbs(struct kvm *kvm)
{
- return hv_remote_flush_tlb_with_range(kvm, NULL);
+ return __hv_flush_remote_tlbs_range(kvm, NULL);
}
-EXPORT_SYMBOL_GPL(hv_remote_flush_tlb);
+EXPORT_SYMBOL_GPL(hv_flush_remote_tlbs);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
struct kvm_arch *kvm_arch = &vcpu->kvm->arch;
- if (kvm_x86_ops.tlb_remote_flush == hv_remote_flush_tlb) {
+ if (kvm_x86_ops.flush_remote_tlbs == hv_flush_remote_tlbs) {
spin_lock(&kvm_arch->hv_root_tdp_lock);
vcpu->arch.hv_root_tdp = root_tdp;
if (root_tdp != kvm_arch->hv_root_tdp)
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 287e98ef9df3..f9ca3e7432b2 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -7,11 +7,15 @@
#define __ARCH_X86_KVM_KVM_ONHYPERV_H__
#if IS_ENABLED(CONFIG_HYPERV)
-int hv_remote_flush_tlb_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range);
-int hv_remote_flush_tlb(struct kvm *kvm);
+int hv_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages);
+int hv_flush_remote_tlbs(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
+static inline int hv_flush_remote_tlbs(struct kvm *kvm)
+{
+ return -EOPNOTSUPP;
+}
+
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d7639d126e6c..3c300a196bdf 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -15,6 +15,7 @@
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -42,6 +43,7 @@
#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
+#include "smm.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -159,16 +161,25 @@ bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
&& !(kvm_mwait_in_guest(vcpu->kvm) ||
kvm_can_post_timer_interrupt(vcpu));
}
-EXPORT_SYMBOL_GPL(kvm_can_use_hv_timer);
static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
{
return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
- switch (map->mode) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
case KVM_APIC_MODE_X2APIC: {
u32 offset = (dest_id >> 16) * 16;
u32 max_apic_id = map->max_apic_id;
@@ -193,8 +204,10 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
*mask = dest_id & 0xf;
return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
default:
- /* Not optimized. */
+ WARN_ON_ONCE(1);
return false;
}
}
@@ -206,6 +219,150 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * For simplicity, KVM always allocates enough space for all possible
+ * xAPIC IDs. Yell, but don't kill the VM, as KVM can continue on
+ * without the optimized map.
+ */
+ if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
+ return -EINVAL;
+
+ /*
+ * Bail if a vCPU was added and/or enabled its APIC between allocating
+ * the map and doing the actual calculations for the map. Note, KVM
+ * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
+ * the compiler decides to reload x2apic_id after this check.
+ */
+ if (x2apic_id > new->max_apic_id)
+ return -E2BIG;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation results will
+ * be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic)) {
+ WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ return;
+ }
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
/*
* CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
*
@@ -224,6 +381,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch = false;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@ -256,54 +414,41 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
goto out;
new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_lapic **cluster;
- u16 mask;
- u32 ldr;
- u8 xapic_id;
- u32 x2apic_id;
-
if (!kvm_apic_present(vcpu))
continue;
- xapic_id = kvm_xapic_id(apic);
- x2apic_id = kvm_x2apic_id(apic);
-
- /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
- if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
- x2apic_id <= new->max_apic_id)
- new->phys_map[x2apic_id] = apic;
- /*
- * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
- * prevent them from masking VCPUs with APIC ID <= 0xff.
- */
- if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
- new->phys_map[xapic_id] = apic;
-
- if (!kvm_apic_sw_enabled(apic))
- continue;
-
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
-
- if (apic_x2apic_mode(apic)) {
- new->mode |= KVM_APIC_MODE_X2APIC;
- } else if (ldr) {
- ldr = GET_APIC_LOGICAL_ID(ldr);
- if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
- new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
- else
- new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ kvfree(new);
+ new = NULL;
+ goto out;
}
- if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
- continue;
-
- if (mask)
- cluster[ffs(mask) - 1] = apic;
+ kvm_recalculate_logical_map(new, vcpu);
}
out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
@@ -360,11 +505,6 @@ static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
-static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
-{
- return ((id >> 4) << 16) | (1 << (id & 0xf));
-}
-
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
@@ -941,8 +1081,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
{
if (!kvm->arch.disabled_lapic_found) {
kvm->arch.disabled_lapic_found = true;
- printk(KERN_INFO
- "Disabled LAPIC found during irq injection\n");
+ pr_info("Disabled LAPIC found during irq injection\n");
}
}
@@ -951,7 +1090,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
{
if (kvm->arch.x2apic_broadcast_quirk_disabled) {
if ((irq->dest_id == APIC_BROADCAST &&
- map->mode != KVM_APIC_MODE_X2APIC))
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
return true;
if (irq->dest_id == X2APIC_BROADCAST)
return true;
@@ -1170,9 +1309,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- result = 1;
- kvm_make_request(KVM_REQ_SMI, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!kvm_inject_smi(vcpu)) {
+ kvm_vcpu_kick(vcpu);
+ result = 1;
+ }
break;
case APIC_DM_NMI:
@@ -1363,7 +1503,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
ktime_t remaining, now;
s64 ns;
- u32 tmcct;
ASSERT(apic != NULL);
@@ -1378,10 +1517,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
+ return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1441,19 +1577,15 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
{
- unsigned char alignment = offset & 0xf;
- u32 result;
- /* this bitmask has a bit cleared for each reserved register */
+ /* Leave bits '0' for reserved and write-only registers. */
u64 valid_reg_mask =
APIC_REG_MASK(APIC_ID) |
APIC_REG_MASK(APIC_LVR) |
APIC_REG_MASK(APIC_TASKPRI) |
APIC_REG_MASK(APIC_PROCPRI) |
APIC_REG_MASK(APIC_LDR) |
- APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_SPIV) |
APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
@@ -1473,21 +1605,33 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
- /*
- * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
- * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
- * manually handled by the caller.
- */
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_ICR2);
- else
- WARN_ON_ONCE(offset == APIC_ICR);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
if (alignment + len > 4)
return 1;
- if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset)))
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
return 1;
result = __apic_read(apic, offset & ~0xf);
@@ -1559,7 +1703,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
if (apic->lapic_timer.period < min_period) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
+ "vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
apic->lapic_timer.period, min_period);
@@ -1840,11 +1984,15 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
if (unlikely(count_reg != APIC_TMICT)) {
deadline = tmict_to_ns(apic,
kvm_lapic_get_reg(apic, count_reg));
- if (unlikely(deadline <= 0))
- deadline = apic->lapic_timer.period;
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
else if (unlikely(deadline > apic->lapic_timer.period)) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested lapic timer restore with "
+ "vcpu %i: requested lapic timer restore with "
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
"Using initial count to start timer.\n",
apic->vcpu->vcpu_id,
@@ -1912,7 +2060,6 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
}
-EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
static void cancel_hv_timer(struct kvm_lapic *apic)
{
@@ -2068,19 +2215,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
-{
- struct kvm *kvm = apic->vcpu->kvm;
-
- if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
- return;
-
- if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
- return;
-
- kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
-}
-
static int get_lvt_index(u32 reg)
{
if (reg == APIC_LVTCMCI)
@@ -2101,7 +2235,6 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- kvm_lapic_xapic_id_updated(apic);
} else {
ret = 1;
}
@@ -2219,10 +2352,14 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic))
- kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
- else
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
break;
default:
ret = 1;
@@ -2284,23 +2421,18 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 val;
- if (apic_x2apic_mode(apic)) {
- if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm))
- return;
- } else {
- val = kvm_lapic_get_reg(apic, offset);
- }
-
/*
* ICR is a single 64-bit register when x2APIC is enabled. For legacy
* xAPIC, ICR writes need to go down the common (slightly slower) path
* to get the upper half from ICR2.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+ val = kvm_lapic_get_reg64(apic, APIC_ICR);
kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
trace_kvm_apic_write(APIC_ICR, val);
} else {
/* TODO: optimize to just emulate side effect w/o one more write */
+ val = kvm_lapic_get_reg(apic, offset);
kvm_lapic_reg_write(apic, offset, (u32)val);
}
}
@@ -2394,11 +2526,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
}
- if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
- kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
- kvm_vcpu_update_apicv(vcpu);
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
}
@@ -2429,8 +2565,79 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
*/
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
+ apic->highest_isr_cache = -1;
+}
+
+int kvm_alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ void __user *hva;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ goto out;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
+ goto out;
+ }
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_memslot_enabled = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+ * needed as modifying memslots requires waiting other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
@@ -2485,7 +2692,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2722,8 +2928,6 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
- } else {
- kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
@@ -2772,7 +2976,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
__start_apic_timer(apic, APIC_TMCCT);
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
@@ -2943,13 +3146,17 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
{
/*
- * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
* can be written as such, all other registers remain accessible only
* through 32-bit reads/writes.
*/
if (reg == APIC_ICR)
return kvm_x2apic_icr_write(apic, data);
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
@@ -2972,9 +3179,6 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR)
- return 1;
-
return kvm_lapic_msr_read(apic, reg, data);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index a5ac4a5a5179..0a0ea4b5dd8c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -7,7 +7,7 @@
#include <linux/kvm_host.h>
#include "hyperv.h"
-#include "kvm_cache_regs.h"
+#include "smm.h"
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
@@ -112,6 +112,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
@@ -144,6 +146,8 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
@@ -188,11 +192,11 @@ static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
extern struct static_key_false_deferred apic_hw_disabled;
-static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
+static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic)
{
if (static_branch_unlikely(&apic_hw_disabled.key))
return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
- return MSR_IA32_APICBASE_ENABLE;
+ return true;
}
extern struct static_key_false_deferred apic_sw_disabled;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6bdaacb6faa0..92d5a1924fc1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -113,6 +113,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
@@ -132,7 +134,7 @@ static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
{
BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
- return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)
? cr3 & X86_CR3_PCID_MASK
: 0;
}
@@ -153,6 +155,24 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_role.level);
}
+static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ /*
+ * When EPT is enabled, KVM may passthrough CR0.WP to the guest, i.e.
+ * @mmu's snapshot of CR0.WP and thus all related paging metadata may
+ * be stale. Refresh CR0.WP and the metadata on-demand when checking
+ * for permission faults. Exempt nested MMUs, i.e. MMUs for shadowing
+ * nEPT and nNPT, as CR0.WP is ignored in both cases. Note, KVM does
+ * need to refresh nested_mmu, a.k.a. the walker used to translate L2
+ * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+ */
+ if (!tdp_enabled || mmu == &vcpu->arch.guest_mmu)
+ return;
+
+ __kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+}
+
/*
* Check if a given access (described through the I/D, W/R and U/S bits of a
* page fault error code pfec) causes a permission fault with the given PTE
@@ -184,8 +204,12 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
- bool fault = (mmu->permissions[index] >> pte_access) & 1;
u32 errcode = PFERR_PRESENT_MASK;
+ bool fault;
+
+ kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
+
+ fault = (mmu->permissions[index] >> pte_access) & 1;
WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
if (unlikely(mmu->pkru_mask)) {
@@ -230,14 +254,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
}
#ifdef CONFIG_X86_64
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+extern bool tdp_mmu_enabled;
#else
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#define tdp_mmu_enabled false
#endif
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
- return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
}
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1ccb769f62af..6eaa3d6994ae 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -14,6 +14,7 @@
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "irq.h"
#include "ioapic.h"
@@ -22,6 +23,7 @@
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "smm.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"
@@ -42,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
#include <linux/kthread.h>
#include <asm/page.h>
@@ -98,6 +101,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
*/
bool tdp_enabled = false;
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+#endif
+
static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
@@ -115,17 +125,31 @@ module_param(dbg, bool, 0644);
#define PTE_LIST_EXT 14
/*
- * Slight optimization of cacheline layout, by putting `more' and `spte_count'
- * at the start; then accessing it will only use one single cacheline for
- * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ * struct pte_list_desc is the core data structure used to implement a custom
+ * list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
+ * given GFN when used in the context of rmaps. Using a custom list allows KVM
+ * to optimize for the common case where many GFNs will have at most a handful
+ * of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
+ * memory footprint, which in turn improves runtime performance by exploiting
+ * cache locality.
+ *
+ * A list is comprised of one or more pte_list_desc objects (descriptors).
+ * Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
+ * is full and a new SPTEs needs to be added, a new descriptor is allocated and
+ * becomes the head of the list. This means that by definitions, all tail
+ * descriptors are full.
+ *
+ * Note, the meta data fields are deliberately placed at the start of the
+ * structure to optimize the cacheline layout; accessing the descriptor will
+ * touch only a single cacheline so long as @spte_count<=6 (or if only the
+ * descriptors metadata is accessed).
*/
struct pte_list_desc {
struct pte_list_desc *more;
- /*
- * Stores number of entries stored in the pte_list_desc. No need to be
- * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
- */
- u64 spte_count;
+ /* The number of PTEs stored in _this_ descriptor. */
+ u32 spte_count;
+ /* The number of PTEs stored in all tails of this descriptor. */
+ u32 tail_count;
u64 *sptes[PTE_LIST_EXT];
};
@@ -232,32 +256,46 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
return regs;
}
-static inline bool kvm_available_flush_tlb_with_range(void)
+static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops.tlb_remote_flush_with_range;
+ return kvm_read_cr3(vcpu);
+}
+
+static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
+ return kvm_read_cr3(vcpu);
+
+ return mmu->get_guest_pgd(vcpu);
}
-static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
- struct kvm_tlb_range *range)
+static inline bool kvm_available_flush_remote_tlbs_range(void)
{
- int ret = -ENOTSUPP;
+ return kvm_x86_ops.flush_remote_tlbs_range;
+}
- if (range && kvm_x86_ops.tlb_remote_flush_with_range)
- ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages)
+{
+ int ret = -EOPNOTSUPP;
+ if (kvm_x86_ops.flush_remote_tlbs_range)
+ ret = static_call(kvm_x86_flush_remote_tlbs_range)(kvm, start_gfn,
+ nr_pages);
if (ret)
kvm_flush_remote_tlbs(kvm);
}
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages)
-{
- struct kvm_tlb_range range;
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
- range.start_gfn = start_gfn;
- range.pages = pages;
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
- kvm_flush_remote_tlbs_with_range(kvm, &range);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
@@ -608,9 +646,14 @@ static bool mmu_spte_age(u64 *sptep)
return true;
}
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_begin();
} else {
/*
@@ -629,7 +672,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_end();
} else {
/*
@@ -799,18 +842,34 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- if (sp->lpage_disallowed)
+ /*
+ * If it's possible to replace the shadow page with an NX huge page,
+ * i.e. if the shadow page is the only thing currently preventing KVM
+ * from using a huge page, add the shadow page to the list of "to be
+ * zapped for NX recovery" pages. Note, the shadow page can already be
+ * on the list if KVM is reusing an existing shadow page, i.e. if KVM
+ * links a shadow page at multiple points.
+ */
+ if (!list_empty(&sp->possible_nx_huge_page_link))
return;
++kvm->stat.nx_lpage_splits;
- list_add_tail(&sp->lpage_disallowed_link,
- &kvm->arch.lpage_disallowed_mmu_pages);
- sp->lpage_disallowed = true;
+ list_add_tail(&sp->possible_nx_huge_page_link,
+ &kvm->arch.possible_nx_huge_pages);
+}
+
+static void account_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ bool nx_huge_page_possible)
+{
+ sp->nx_huge_page_disallowed = true;
+
+ if (nx_huge_page_possible)
+ track_possible_nx_huge_page(kvm, sp);
}
static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -830,16 +889,25 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ if (list_empty(&sp->possible_nx_huge_page_link))
+ return;
+
--kvm->stat.nx_lpage_splits;
- sp->lpage_disallowed = false;
- list_del(&sp->lpage_disallowed_link);
+ list_del_init(&sp->possible_nx_huge_page_link);
+}
+
+static void unaccount_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ sp->nx_huge_page_disallowed = false;
+
+ untrack_possible_nx_huge_page(kvm, sp);
}
-static struct kvm_memory_slot *
-gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
+static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -878,53 +946,69 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
+ desc->tail_count = 0;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->spte_count == PTE_LIST_EXT) {
- count += PTE_LIST_EXT;
- if (!desc->more) {
- desc->more = kvm_mmu_memory_cache_alloc(cache);
- desc = desc->more;
- desc->spte_count = 0;
- break;
- }
- desc = desc->more;
+ count = desc->tail_count + desc->spte_count;
+
+ /*
+ * If the previous head is full, allocate a new head descriptor
+ * as tail descriptors are always kept full.
+ */
+ if (desc->spte_count == PTE_LIST_EXT) {
+ desc = kvm_mmu_memory_cache_alloc(cache);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->spte_count = 0;
+ desc->tail_count = count;
+ rmap_head->val = (unsigned long)desc | 1;
}
- count += desc->spte_count;
desc->sptes[desc->spte_count++] = spte;
}
return count;
}
-static void
-pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
- struct pte_list_desc *desc, int i,
- struct pte_list_desc *prev_desc)
+static void pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i)
{
- int j = desc->spte_count - 1;
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ int j = head_desc->spte_count - 1;
- desc->sptes[i] = desc->sptes[j];
- desc->sptes[j] = NULL;
- desc->spte_count--;
- if (desc->spte_count)
+ /*
+ * The head descriptor should never be empty. A new head is added only
+ * when adding an entry and the previous head is full, and heads are
+ * removed (this flow) when they become empty.
+ */
+ BUG_ON(j < 0);
+
+ /*
+ * Replace the to-be-freed SPTE with the last valid entry from the head
+ * descriptor to ensure that tail descriptors are full at all times.
+ * Note, this also means that tail_count is stable for each descriptor.
+ */
+ desc->sptes[i] = head_desc->sptes[j];
+ head_desc->sptes[j] = NULL;
+ head_desc->spte_count--;
+ if (head_desc->spte_count)
return;
- if (!prev_desc && !desc->more)
+
+ /*
+ * The head descriptor is empty. If there are no tail descriptors,
+ * nullify the rmap head to mark the list as emtpy, else point the rmap
+ * head at the next descriptor, i.e. the new head.
+ */
+ if (!head_desc->more)
rmap_head->val = 0;
else
- if (prev_desc)
- prev_desc->more = desc->more;
- else
- rmap_head->val = (unsigned long)desc->more | 1;
- mmu_free_pte_list_desc(desc);
+ rmap_head->val = (unsigned long)head_desc->more | 1;
+ mmu_free_pte_list_desc(head_desc);
}
static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- struct pte_list_desc *prev_desc;
int i;
if (!rmap_head->val) {
@@ -940,16 +1024,13 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
} else {
rmap_printk("%p many->many\n", spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- prev_desc = NULL;
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(rmap_head,
- desc, i, prev_desc);
+ pte_list_desc_remove_entry(rmap_head, desc, i);
return;
}
}
- prev_desc = desc;
desc = desc->more;
}
pr_err("%s: %p many->many\n", __func__, spte);
@@ -996,7 +1077,6 @@ out:
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- unsigned int count = 0;
if (!rmap_head->val)
return 0;
@@ -1004,13 +1084,7 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
return 1;
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
-
- while (desc) {
- count += desc->spte_count;
- desc = desc->more;
- }
-
- return count;
+ return desc->tail_count + desc->spte_count;
}
static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
@@ -1022,14 +1096,6 @@ static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
-static bool rmap_can_add(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_memory_cache *mc;
-
- mc = &vcpu->arch.mmu_pte_list_desc_cache;
- return kvm_mmu_memory_cache_nr_free_objects(mc);
-}
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_memslots *slots;
@@ -1148,8 +1214,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
drop_spte(kvm, sptep);
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
}
/*
@@ -1253,7 +1318,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
@@ -1286,7 +1351,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
@@ -1369,7 +1434,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
}
}
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
@@ -1429,8 +1494,8 @@ restart:
}
}
- if (need_flush && kvm_available_flush_tlb_with_range()) {
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ if (need_flush && kvm_available_flush_remote_tlbs_range()) {
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
return false;
}
@@ -1454,8 +1519,8 @@ struct slot_rmap_walk_iterator {
struct kvm_rmap_head *end_rmap;
};
-static void
-rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
+ int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
@@ -1463,10 +1528,10 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
-static void
-slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- const struct kvm_memory_slot *slot, int start_level,
- int end_level, gfn_t start_gfn, gfn_t end_gfn)
+static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ const struct kvm_memory_slot *slot,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
iterator->start_level = start_level;
@@ -1532,7 +1597,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
@@ -1545,7 +1610,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
return flush;
@@ -1600,8 +1665,7 @@ static void __rmap_add(struct kvm *kvm,
kvm->stat.max_mmu_rmap_size = rmap_count;
if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
kvm_zap_all_rmap_sptes(kvm, rmap_head);
- kvm_flush_remote_tlbs_with_address(
- kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}
}
@@ -1620,7 +1684,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
@@ -1633,7 +1697,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
@@ -1645,7 +1709,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *pos;
u64 *end;
- for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
+ for (pos = spt, end = pos + SPTE_ENT_PER_PAGE; pos != end; pos++)
if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
@@ -1740,12 +1804,6 @@ static void mark_unsync(u64 *spte)
kvm_mmu_mark_parents_unsync(sp);
}
-static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- return -1;
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -1793,7 +1851,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
continue;
}
- child = to_shadow_page(ent & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(ent);
if (child->unsync_children) {
if (mmu_pages_add(pvec, child, i))
@@ -1865,10 +1923,79 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
+static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
+
+ /*
+ * Ignore various flags when verifying that it's safe to sync a shadow
+ * page using the current MMU context.
+ *
+ * - level: not part of the overall MMU role and will never match as the MMU's
+ * level tracks the root level
+ * - access: updated based on the new guest PTE
+ * - quadrant: not part of the overall MMU role (similar to level)
+ */
+ const union kvm_mmu_page_role sync_role_ign = {
+ .level = 0xf,
+ .access = 0x7,
+ .quadrant = 0x3,
+ .passthrough = 0x1,
+ };
+
+ /*
+ * Direct pages can never be unsync, and KVM should never attempt to
+ * sync a shadow page for a different MMU context, e.g. if the role
+ * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
+ * reserved bits checks will be wrong, etc...
+ */
+ if (WARN_ON_ONCE(sp->role.direct || !vcpu->arch.mmu->sync_spte ||
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
+ return false;
+
+ return true;
+}
+
+static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
+{
+ if (!sp->spt[i])
+ return 0;
+
+ return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
+}
+
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ int flush = 0;
+ int i;
+
+ if (!kvm_sync_page_check(vcpu, sp))
+ return -1;
+
+ for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
+ int ret = kvm_sync_spte(vcpu, sp, i);
+
+ if (ret < -1)
+ return -1;
+ flush |= ret;
+ }
+
+ /*
+ * Note, any flush is purely for KVM's correctness, e.g. when dropping
+ * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
+ * unmap or dirty logging event doesn't fail to flush. The guest is
+ * responsible for flushing the TLB to ensure any changes in protection
+ * bits are recognized, i.e. until the guest flushes or page faults on
+ * a relevant address, KVM is architecturally allowed to let vCPUs use
+ * cached translations with the old protection bits.
+ */
+ return flush;
+}
+
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+ int ret = __kvm_sync_page(vcpu, sp);
if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
@@ -1894,8 +2021,8 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (sp->role.invalid)
return true;
- /* TDP MMU pages due not use the MMU generation. */
- return !sp->tdp_mmu_page &&
+ /* TDP MMU pages do not use the MMU generation. */
+ return !is_tdp_mmu_page(sp) &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2129,6 +2256,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
/*
* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
* depends on valid pages being added to the head of the list. See
@@ -2327,7 +2456,16 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_page_add_parent_pte(cache, sp, sptep);
- if (sp->unsync_children || sp->unsync)
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
mark_unsync(sptep);
}
@@ -2350,12 +2488,12 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* so we should update the spte at this point to get
* a new sp with the correct access.
*/
- child = to_shadow_page(*sptep & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(*sptep);
if (child->role.access == direct_access)
return;
drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@@ -2371,7 +2509,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
} else {
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, spte);
/*
@@ -2443,6 +2581,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
{
bool list_unstable, zapped_root = false;
+ lockdep_assert_held_write(&kvm->mmu_lock);
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
@@ -2486,8 +2625,8 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
zapped_root = !is_obsolete_sp(kvm, sp);
}
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ if (sp->nx_huge_page_disallowed)
+ unaccount_nx_huge_page(kvm, sp);
sp->role.invalid = 1;
@@ -2810,7 +2949,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
struct kvm_mmu_page *child;
u64 pte = *sptep;
- child = to_shadow_page(pte & SPTE_BASE_ADDR_MASK);
+ child = spte_to_child_sp(pte);
drop_parent_pte(child, sptep);
flush = true;
} else if (pfn != spte_to_pfn(*sptep)) {
@@ -2838,8 +2977,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (flush)
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -3084,13 +3222,14 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
if (cur_level > PG_LEVEL_4K &&
cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
- !is_large_pte(spte)) {
+ !is_large_pte(spte) &&
+ spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
- * A small SPTE exists for this pfn, but FNAME(fetch)
- * and __direct_map would like to create a large PTE
- * instead: just force them to go down another level,
- * patching back for them into pfn the next 9 bits of
- * the address.
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of the
+ * address.
*/
u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
KVM_PAGES_PER_HPAGE(cur_level - 1);
@@ -3099,7 +3238,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3117,7 +3256,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -3126,9 +3265,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->is_tdp && fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -3143,55 +3282,62 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return ret;
}
-static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
{
- send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
+ if (is_sigpending_pfn(fault->pfn)) {
+ kvm_handle_signal_exit(vcpu);
+ return -EINTR;
+ }
+
/*
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
*/
- if (pfn == KVM_PFN_ERR_RO_FAULT)
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
return RET_PF_EMULATE;
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
return RET_PF_RETRY;
}
return -EFAULT;
}
-static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- unsigned int access)
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
{
- /* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
- if (unlikely(!fault->slot)) {
- gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
- vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
- access & shadow_mmio_access_mask);
- /*
- * If MMIO caching is disabled, emulate immediately without
- * touching the shadow page tables as attempting to install an
- * MMIO SPTE will just be an expensive nop. Do not cache MMIO
- * whose gfn is greater than host.MAXPHYADDR, any guest that
- * generates such gfns is running nested and is being tricked
- * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
- * and only if L1's MAXPHYADDR is inaccurate with respect to
- * the hardware's).
- */
- if (unlikely(!enable_mmio_caching) ||
- unlikely(fault->gfn > kvm_mmu_max_gfn()))
- return RET_PF_EMULATE;
- }
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
return RET_PF_CONTINUE;
}
@@ -3236,9 +3382,9 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
* Returns true if the SPTE was fixed successfully. Otherwise,
* someone else modified the SPTE from its original value.
*/
-static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- u64 *sptep, u64 old_spte, u64 new_spte)
+static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ u64 *sptep, u64 old_spte, u64 new_spte)
{
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
@@ -3315,7 +3461,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
do {
u64 new_spte;
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (tdp_mmu_enabled)
sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3398,8 +3544,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
if (++retry_count > 4) {
- printk_once(KERN_WARNING
- "kvm: Fast #PF retrying more than 4 times.\n");
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
break;
}
@@ -3422,7 +3567,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (!VALID_PAGE(*root_hpa))
return;
- sp = to_shadow_page(*root_hpa & SPTE_BASE_ADDR_MASK);
+ /*
+ * The "root" may be a special root, e.g. a PAE entry, treat it as a
+ * SPTE to ensure any non-PA bits are dropped.
+ */
+ sp = spte_to_child_sp(*root_hpa);
if (WARN_ON(!sp))
return;
@@ -3442,6 +3591,8 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
LIST_HEAD(invalid_list);
bool free_active_root;
+ WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
+
BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
/* Before acquiring the MMU lock, see if we need to do any real work. */
@@ -3557,7 +3708,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_unlock;
- if (is_tdp_mmu_enabled(vcpu->kvm)) {
+ if (tdp_mmu_enabled) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
@@ -3660,7 +3811,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
int quadrant, i, r;
hpa_t root;
- root_pgd = mmu->get_guest_pgd(vcpu);
+ root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
root_gfn = root_pgd >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
@@ -3907,8 +4058,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->pae_root[i];
if (IS_VALID_PAE_ROOT(root)) {
- root &= SPTE_BASE_ADDR_MASK;
- sp = to_shadow_page(root);
+ sp = spte_to_child_sp(root);
mmu_sync_children(vcpu, sp, true);
}
}
@@ -3988,7 +4138,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
walk_shadow_page_lockless_begin(vcpu);
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (is_tdp_mmu_active(vcpu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
@@ -4111,7 +4261,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
- arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
+ arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4130,13 +4280,13 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
return;
if (!vcpu->arch.mmu->root_role.direct &&
- work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
+ work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
}
-static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
bool async;
@@ -4169,7 +4319,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
fault->write, &fault->map_writable,
&fault->hva);
if (!async)
@@ -4186,18 +4336,44 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
}
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ /*
+ * Allow gup to bail on pending non-fatal signals when it's also allowed
+ * to wait for IO. Note, gup always bails if it is unable to quickly
+ * get a page and a fatal signal, i.e. SIGKILL, is pending.
+ */
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
fault->write, &fault->map_writable,
&fault->hva);
return RET_PF_CONTINUE;
}
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ int ret;
+
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ ret = __kvm_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (unlikely(!fault->slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ return RET_PF_CONTINUE;
+}
+
/*
* Returns true if the page fault is stale and needs to be retried, i.e. if the
* root was invalidated by a memslot update or a relevant mmu_notifier fired.
*/
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault, int mmu_seq)
+ struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
@@ -4217,19 +4393,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
return true;
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+ mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
-
- unsigned long mmu_seq;
int r;
- fault->gfn = fault->addr >> PAGE_SHIFT;
- fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
-
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@@ -4241,41 +4411,24 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
- if (is_tdp_mmu_fault)
- read_lock(&vcpu->kvm->mmu_lock);
- else
- write_lock(&vcpu->kvm->mmu_lock);
-
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ if (is_page_fault_stale(vcpu, fault))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- if (is_tdp_mmu_fault)
- r = kvm_tdp_mmu_map(vcpu, fault);
- else
- r = __direct_map(vcpu, fault);
+ r = direct_map(vcpu, fault);
out_unlock:
- if (is_tdp_mmu_fault)
- read_unlock(&vcpu->kvm->mmu_lock);
- else
- write_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -4323,6 +4476,42 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_EMULATE;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ read_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+#endif
+
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
/*
@@ -4340,13 +4529,19 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
+ gfn_t base = gfn_round_for_level(fault->gfn,
+ fault->max_level);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
}
}
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
return direct_page_fault(vcpu, fault);
}
@@ -4354,8 +4549,7 @@ static void nonpaging_init_context(struct kvm_mmu *context)
{
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
+ context->sync_spte = NULL;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4451,10 +4645,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role new_role = mmu->root_role;
- if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
- /* kvm_mmu_ensure_valid_pgd will set up a new root. */
+ /*
+ * Return immediately if no usable root was found, kvm_mmu_reload()
+ * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
return;
- }
/*
* It's possible that the cached previous root page is obsolete because
@@ -4487,11 +4683,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
}
EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr3(vcpu);
-}
-
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
@@ -4521,10 +4712,9 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
#include "paging_tmpl.h"
#undef PTTYPE
-static void
-__reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, int level, bool nx, bool gbpages,
- bool pse, bool amd)
+static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, int level, bool nx,
+ bool gbpages, bool pse, bool amd)
{
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
@@ -4637,9 +4827,9 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
guest_cpuid_is_amd_or_hygon(vcpu));
}
-static void
-__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
- u64 pa_bits_rsvd, bool execonly, int huge_page_level)
+static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ u64 pa_bits_rsvd, bool execonly,
+ int huge_page_level)
{
u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51);
u64 large_1g_rsvd = 0, large_2m_rsvd = 0;
@@ -4739,8 +4929,7 @@ static inline bool boot_cpu_is_amd(void)
* the direct page table on host, use as much mmu features as
* possible, however, kvm currently does not do execution-protection.
*/
-static void
-reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
+static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
{
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4943,20 +5132,18 @@ static void paging64_init_context(struct kvm_mmu *context)
{
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->sync_page = paging64_sync_page;
- context->invlpg = paging64_invlpg;
+ context->sync_spte = paging64_sync_spte;
}
static void paging32_init_context(struct kvm_mmu *context)
{
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
- context->sync_page = paging32_sync_page;
- context->invlpg = paging32_invlpg;
+ context->sync_spte = paging32_sync_spte;
}
-static union kvm_cpu_role
-kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
+static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
+ const struct kvm_mmu_role_regs *regs)
{
union kvm_cpu_role role = {0};
@@ -4995,6 +5182,21 @@ kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
return role;
}
+void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu)
+{
+ const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
+
+ BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
+ BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
+
+ if (is_cr0_wp(mmu) == cr0_wp)
+ return;
+
+ mmu->cpu_role.base.cr0_wp = cr0_wp;
+ reset_guest_paging_metadata(vcpu, mmu);
+}
+
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
/* tdp_root_level is architecture forced level, use it if nonzero */
@@ -5040,9 +5242,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
context->cpu_role.as_u64 = cpu_role.as_u64;
context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
- context->sync_page = nonpaging_sync_page;
- context->invlpg = NULL;
- context->get_guest_pgd = get_cr3;
+ context->sync_spte = NULL;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
@@ -5172,8 +5373,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
- context->sync_page = ept_sync_page;
- context->invlpg = ept_invlpg;
+ context->sync_spte = ept_sync_spte;
update_permission_bitmask(context, true);
context->pkru_mask = 0;
@@ -5192,7 +5392,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
kvm_init_shadow_mmu(vcpu, cpu_role);
- context->get_guest_pgd = get_cr3;
+ context->get_guest_pgd = get_guest_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
@@ -5206,7 +5406,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
return;
g_context->cpu_role.as_u64 = new_mode.as_u64;
- g_context->get_guest_pgd = get_cr3;
+ g_context->get_guest_pgd = get_guest_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
@@ -5214,7 +5414,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
* L2 page tables are never shadowed, so there is no need to sync
* SPTEs.
*/
- g_context->invlpg = NULL;
+ g_context->sync_spte = NULL;
/*
* Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
@@ -5276,7 +5476,7 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* Changing guest CPUID after KVM_RUN is forbidden, see the comment in
* kvm_arch_vcpu_ioctl().
*/
- KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
+ KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5547,7 +5747,8 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false);
+ lower_32_bits(error_code), false,
+ &emulation_type);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
@@ -5589,48 +5790,77 @@ emulate:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
-void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t gva, hpa_t root_hpa)
+static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, hpa_t root_hpa)
+{
+ struct kvm_shadow_walk_iterator iterator;
+
+ vcpu_clear_mmio_info(vcpu, addr);
+
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ write_lock(&vcpu->kvm->mmu_lock);
+ for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
+ struct kvm_mmu_page *sp = sptep_to_sp(iterator.sptep);
+
+ if (sp->unsync) {
+ int ret = kvm_sync_spte(vcpu, sp, iterator.index);
+
+ if (ret < 0)
+ mmu_page_zap_pte(vcpu->kvm, sp, iterator.sptep, NULL);
+ if (ret)
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, iterator.sptep);
+ }
+
+ if (!sp->unsync_children)
+ break;
+ }
+ write_unlock(&vcpu->kvm->mmu_lock);
+}
+
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ u64 addr, unsigned long roots)
{
int i;
+ WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
+
/* It's actually a GPA for vcpu->arch.guest_mmu. */
if (mmu != &vcpu->arch.guest_mmu) {
/* INVLPG on a non-canonical address is a NOP according to the SDM. */
- if (is_noncanonical_address(gva, vcpu))
+ if (is_noncanonical_address(addr, vcpu))
return;
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
+ static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
}
- if (!mmu->invlpg)
+ if (!mmu->sync_spte)
return;
- if (root_hpa == INVALID_PAGE) {
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
+ if (roots & KVM_MMU_ROOT_CURRENT)
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->root.hpa);
- /*
- * INVLPG is required to invalidate any global mappings for the VA,
- * irrespective of PCID. Since it would take us roughly similar amount
- * of work to determine whether any of the prev_root mappings of the VA
- * is marked global, or to just sync it blindly, so we might as well
- * just always sync it.
- *
- * Mappings not reachable via the current cr3 or the prev_roots will be
- * synced when switching to that cr3, so nothing needs to be done here
- * for them.
- */
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (VALID_PAGE(mmu->prev_roots[i].hpa))
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- } else {
- mmu->invlpg(vcpu, gva, root_hpa);
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (roots & KVM_MMU_ROOT_PREVIOUS(i))
+ __kvm_mmu_invalidate_addr(vcpu, mmu, addr, mmu->prev_roots[i].hpa);
}
}
+EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
+ /*
+ * INVLPG is required to invalidate any global mappings for the VA,
+ * irrespective of PCID. Blindly sync all roots as it would take
+ * roughly the same amount of work/time to determine whether any of the
+ * previous roots have a global mapping.
+ *
+ * Mappings not reachable via the current or previous cached roots will
+ * be synced when switching to that new cr3, so nothing needs to be
+ * done here for them.
+ */
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5639,27 +5869,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- bool tlb_flush = false;
+ unsigned long roots = 0;
uint i;
- if (pcid == kvm_get_active_pcid(vcpu)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->root.hpa);
- tlb_flush = true;
- }
+ if (pcid == kvm_get_active_pcid(vcpu))
+ roots |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
- pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- if (mmu->invlpg)
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
- tlb_flush = true;
- }
+ pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
- if (tlb_flush)
- static_call(kvm_x86_flush_tlb_gva)(vcpu, gva);
-
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
++vcpu->stat.invlpg;
/*
@@ -5676,6 +5899,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
/*
* max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
@@ -5693,29 +5919,30 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm,
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
const struct kvm_memory_slot *slot);
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
- bool flush)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool flush_on_yield, bool flush)
{
struct slot_rmap_walk_iterator iterator;
- for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, memslot);
+ flush |= fn(kvm, iterator.rmap, slot);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_with_address(kvm,
- start_gfn,
- iterator.gfn - start_gfn + 1);
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
flush = false;
}
cond_resched_rwlock_write(&kvm->mmu_lock);
@@ -5725,23 +5952,23 @@ slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
return flush;
}
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
{
- return slot_handle_level_range(kvm, memslot, fn, start_level,
- end_level, memslot->base_gfn,
- memslot->base_gfn + memslot->npages - 1,
- flush_on_yield, false);
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ flush_on_yield, false);
}
-static __always_inline bool
-slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool flush_on_yield)
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
{
- return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
- PG_LEVEL_4K, flush_on_yield);
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
}
static void free_mmu_pages(struct kvm_mmu *mmu)
@@ -5923,7 +6150,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_invalidate_all_roots(kvm);
/*
@@ -5948,7 +6175,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Deferring the zap until the final reference to the root is put would
* lead to use-after-free.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
@@ -5971,12 +6198,14 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
- INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- r = kvm_mmu_init_tdp_mmu(kvm);
- if (r < 0)
- return r;
+ if (tdp_mmu_enabled) {
+ r = kvm_mmu_init_tdp_mmu(kvm);
+ if (r < 0)
+ return r;
+ }
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -6006,7 +6235,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_page_track_unregister_notifier(kvm, node);
- kvm_mmu_uninit_tdp_mmu(kvm);
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
mmu_free_vm_memory_caches(kvm);
}
@@ -6033,9 +6263,9 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = slot_handle_level_range(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
}
}
@@ -6060,15 +6290,14 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
gfn_end, true, flush);
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
+ kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start);
kvm_mmu_invalidate_end(kvm, 0, -1ul);
@@ -6088,12 +6317,12 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
{
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- slot_handle_level(kvm, memslot, slot_rmap_write_protect,
- start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
+ walk_slot_rmaps(kvm, memslot, slot_rmap_write_protect,
+ start_level, KVM_MAX_HUGEPAGE_LEVEL, false);
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
read_unlock(&kvm->mmu_lock);
@@ -6324,10 +6553,9 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
* all the way to the target level. There's no need to split pages
* already at the target level.
*/
- for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
- slot_handle_level_range(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
- }
+ for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
+ __walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
+ level, level, start, end - 1, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6336,7 +6564,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
u64 start, u64 end,
int target_level)
{
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm))
@@ -6357,7 +6585,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
u64 start = memslot->base_gfn;
u64 end = start + memslot->npages;
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm)) {
@@ -6406,9 +6634,8 @@ restart:
PG_LEVEL_NUM)) {
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- if (kvm_available_flush_tlb_with_range())
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ if (kvm_available_flush_remote_tlbs_range())
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
else
need_tlb_flush = 1;
@@ -6426,8 +6653,8 @@ static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
* Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
* pages that are already mapped at the maximum hugepage level.
*/
- if (slot_handle_level(kvm, slot, kvm_mmu_zap_collapsible_spte,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
+ if (walk_slot_rmaps(kvm, slot, kvm_mmu_zap_collapsible_spte,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - 1, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
}
@@ -6440,7 +6667,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
@@ -6458,8 +6685,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
* is observed by any other operation on the same memslot.
*/
lockdep_assert_held(&kvm->slots_lock);
- kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
- memslot->npages);
+ kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
@@ -6471,11 +6697,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
* Clear dirty bits only on 4k SPTEs since the legacy MMU only
* support dirty logging at a 4k granularity.
*/
- slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
+ walk_slot_rmaps_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
read_unlock(&kvm->mmu_lock);
@@ -6510,7 +6736,7 @@ restart:
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_all(kvm);
write_unlock(&kvm->mmu_lock);
@@ -6536,13 +6762,13 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* zap all shadow pages.
*/
if (unlikely(gen == 0)) {
- kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_zap_all_fast(kvm);
}
}
-static unsigned long
-mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct kvm *kvm;
int nr_to_scan = sc->nr_to_scan;
@@ -6600,8 +6826,8 @@ unlock:
return freed;
}
-static unsigned long
-mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long mmu_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
}
@@ -6641,7 +6867,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
new_val = 1;
else if (sysfs_streq(val, "auto"))
new_val = get_nx_auto_mode();
- else if (strtobool(val, &new_val) < 0)
+ else if (kstrtobool(val, &new_val) < 0)
return -EINVAL;
__set_nx_huge_pages(new_val);
@@ -6656,7 +6882,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
kvm_mmu_zap_all_fast(kvm);
mutex_unlock(&kvm->slots_lock);
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
}
mutex_unlock(&kvm_lock);
}
@@ -6675,6 +6901,13 @@ void __init kvm_mmu_x86_module_init(void)
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
kvm_mmu_spte_module_init();
}
@@ -6788,7 +7021,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
- wake_up_process(kvm->arch.nx_lpage_recovery_thread);
+ wake_up_process(kvm->arch.nx_huge_page_recovery_thread);
mutex_unlock(&kvm_lock);
}
@@ -6796,9 +7029,10 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
return err;
}
-static void kvm_recover_nx_lpages(struct kvm *kvm)
+static void kvm_recover_nx_huge_pages(struct kvm *kvm)
{
unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
+ struct kvm_memory_slot *slot;
int rcu_idx;
struct kvm_mmu_page *sp;
unsigned int ratio;
@@ -6819,24 +7053,58 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : 0;
for ( ; to_zap; --to_zap) {
- if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages))
+ if (list_empty(&kvm->arch.possible_nx_huge_pages))
break;
/*
* We use a separate list instead of just using active_mmu_pages
- * because the number of lpage_disallowed pages is expected to
- * be relatively small compared to the total.
+ * because the number of shadow pages that be replaced with an
+ * NX huge page is expected to be relatively small compared to
+ * the total number of shadow pages. And because the TDP MMU
+ * doesn't use active_mmu_pages.
*/
- sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
+ sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
struct kvm_mmu_page,
- lpage_disallowed_link);
- WARN_ON_ONCE(!sp->lpage_disallowed);
- if (is_tdp_mmu_page(sp)) {
+ possible_nx_huge_page_link);
+ WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
+ WARN_ON_ONCE(!sp->role.direct);
+
+ /*
+ * Unaccount and do not attempt to recover any NX Huge Pages
+ * that are being dirty tracked, as they would just be faulted
+ * back in as 4KiB pages. The NX Huge Pages in this slot will be
+ * recovered, along with all the other huge pages in the slot,
+ * when dirty logging is disabled.
+ *
+ * Since gfn_to_memslot() is relatively expensive, it helps to
+ * skip it if it the test cannot possibly return true. On the
+ * other hand, if any memslot has logging enabled, chances are
+ * good that all of them do, in which case unaccount_nx_huge_page()
+ * is much cheaper than zapping the page.
+ *
+ * If a memslot update is in progress, reading an incorrect value
+ * of kvm->nr_memslots_dirty_logging is not a problem: if it is
+ * becoming zero, gfn_to_memslot() will be done unnecessarily; if
+ * it is becoming nonzero, the page will be zapped unnecessarily.
+ * Either way, this only affects efficiency in racy situations,
+ * and not correctness.
+ */
+ slot = NULL;
+ if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
+ struct kvm_memslots *slots;
+
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ WARN_ON_ONCE(!slot);
+ }
+
+ if (slot && kvm_slot_dirty_track_enabled(slot))
+ unaccount_nx_huge_page(kvm, sp);
+ else if (is_tdp_mmu_page(sp))
flush |= kvm_tdp_mmu_zap_sp(kvm, sp);
- } else {
+ else
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- WARN_ON_ONCE(sp->lpage_disallowed);
- }
+ WARN_ON_ONCE(sp->nx_huge_page_disallowed);
if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
@@ -6856,7 +7124,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, rcu_idx);
}
-static long get_nx_lpage_recovery_timeout(u64 start_time)
+static long get_nx_huge_page_recovery_timeout(u64 start_time)
{
bool enabled;
uint period;
@@ -6867,19 +7135,19 @@ static long get_nx_lpage_recovery_timeout(u64 start_time)
: MAX_SCHEDULE_TIMEOUT;
}
-static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
+static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
{
u64 start_time;
long remaining_time;
while (true) {
start_time = get_jiffies_64();
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && remaining_time > 0) {
schedule_timeout(remaining_time);
- remaining_time = get_nx_lpage_recovery_timeout(start_time);
+ remaining_time = get_nx_huge_page_recovery_timeout(start_time);
set_current_state(TASK_INTERRUPTIBLE);
}
@@ -6888,7 +7156,7 @@ static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
if (kthread_should_stop())
return 0;
- kvm_recover_nx_lpages(kvm);
+ kvm_recover_nx_huge_pages(kvm);
}
}
@@ -6896,17 +7164,17 @@ int kvm_mmu_post_init_vm(struct kvm *kvm)
{
int err;
- err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
+ err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0,
"kvm-nx-lpage-recovery",
- &kvm->arch.nx_lpage_recovery_thread);
+ &kvm->arch.nx_huge_page_recovery_thread);
if (!err)
- kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
+ kthread_unpark(kvm->arch.nx_huge_page_recovery_thread);
return err;
}
void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
- if (kvm->arch.nx_lpage_recovery_thread)
- kthread_stop(kvm->arch.nx_lpage_recovery_thread);
+ if (kvm->arch.nx_huge_page_recovery_thread)
+ kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 582def531d4d..d39af5639ce9 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -57,7 +57,13 @@ struct kvm_mmu_page {
bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool lpage_disallowed; /* Can't be replaced by an equiv large page */
+
+ /*
+ * The shadow page can't be replaced by an equivalent huge page
+ * because it is being used to map an executable page in the guest
+ * and the NX huge page mitigation is enabled.
+ */
+ bool nx_huge_page_disallowed;
/*
* The following two entries are used to key the shadow page in the
@@ -100,7 +106,14 @@ struct kvm_mmu_page {
};
};
- struct list_head lpage_disallowed_link;
+ /*
+ * Tracks shadow pages that, if zapped, would allow KVM to create an NX
+ * huge page. A shadow page will have nx_huge_page_disallowed set but
+ * not be on the list if a huge page is disallowed for other reasons,
+ * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
+ * isn't properly aligned, etc...
+ */
+ struct list_head possible_nx_huge_page_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -120,18 +133,6 @@ struct kvm_mmu_page {
extern struct kmem_cache *mmu_page_header_cache;
-static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
-{
- struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
-
- return (struct kvm_mmu_page *)page_private(page);
-}
-
-static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
-{
- return to_shadow_page(__pa(sptep));
-}
-
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
return role.smm ? 1 : 0;
@@ -155,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
gfn_t gfn, bool can_unsync, bool prefetch);
@@ -163,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
-void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
- u64 start_gfn, u64 pages);
+
+void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t nr_pages);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_range(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
extern int nx_huge_pages;
@@ -198,7 +213,7 @@ struct kvm_page_fault {
/*
* Maximum page size that can be created for this fault; input to
- * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
*/
u8 max_level;
@@ -221,9 +236,17 @@ struct kvm_page_fault {
struct kvm_memory_slot *slot;
/* Outputs of kvm_faultin_pfn. */
+ unsigned long mmu_seq;
kvm_pfn_t pfn;
hva_t hva;
bool map_writable;
+
+ /*
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables.
+ */
+ bool write_fault_to_shadow_pgtable;
};
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
@@ -257,7 +280,7 @@ enum {
};
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch)
+ u32 err, bool prefetch, int *emulation_type)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -278,6 +301,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
};
int r;
+ if (vcpu->arch.mmu->root_role.direct) {
+ fault.gfn = fault.addr >> PAGE_SHIFT;
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
/*
* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
* guest perspective and have already been counted at the time of the
@@ -291,6 +319,9 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ if (fault.write_fault_to_shadow_pgtable && emulation_type)
+ *emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
+
/*
* Similar to above, prefetch faults aren't truly spurious, and the
* async #PF path doesn't do emulation. Do count faults that are fixed
@@ -315,7 +346,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f..0a2ac438d647 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -10,6 +10,7 @@
* Author:
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/rculist.h>
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5ab5f94dcb6f..0662e0278e70 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -324,7 +324,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
walker->level = mmu->cpu_role.base.level;
- pte = mmu->get_guest_pgd(vcpu);
+ pte = kvm_mmu_get_guest_pgd(vcpu, mmu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
#if PTTYPE == 64
@@ -519,7 +519,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, pt_element_t gpte, bool no_dirty_log)
+ u64 *spte, pt_element_t gpte)
{
struct kvm_memory_slot *slot;
unsigned pte_access;
@@ -535,8 +535,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
- no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK);
if (!slot)
return false;
@@ -605,7 +604,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (is_shadow_present_pte(*spte))
continue;
- if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i]))
break;
}
}
@@ -642,12 +641,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, fault->addr);
- shadow_walk_okay(&it) && it.level > gw->level;
- shadow_walk_next(&it)) {
+ for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
@@ -685,15 +684,22 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
+
+ if (fault->write && table_gfn == fault->gfn)
+ fault->write_fault_to_shadow_pgtable = true;
}
+ /*
+ * Adjust the hugepage size _after_ resolving indirect shadow pages.
+ * KVM doesn't support mapping hugepages into the guest for gfns that
+ * are being shadowed by KVM, i.e. allocating a new shadow page may
+ * affect the allowed hugepage size.
+ */
kvm_mmu_hugepage_adjust(vcpu, fault);
trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
- clear_sp_write_flooding_count(it.sptep);
-
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
@@ -701,7 +707,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -713,9 +719,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
continue;
link_shadow_page(vcpu, it.sptep, sp);
- if (fault->huge_page_disallowed &&
- fault->req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
+ if (fault->huge_page_disallowed)
+ account_nx_huge_page(vcpu->kvm, sp,
+ fault->req_level >= it.level);
}
if (WARN_ON_ONCE(it.level != fault->goal_level))
@@ -733,46 +739,6 @@ out_gpte_changed:
return RET_PF_RETRY;
}
- /*
- * To see whether the mapped gfn can write its page table in the current
- * mapping.
- *
- * It is the helper function of FNAME(page_fault). When guest uses large page
- * size to map the writable gfn which is used as current page table, we should
- * force kvm to use small page size to map it because new shadow page will be
- * created when kvm establishes shadow page table that stop kvm using large
- * page size. Do it early can avoid unnecessary #PF and emulation.
- *
- * @write_fault_to_shadow_pgtable will return true if the fault gfn is
- * currently used as its page table.
- *
- * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
- * since the PDPT is always shadowed, that means, we can not use large page
- * size to map the gfn which is used as PDPT.
- */
-static bool
-FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker, bool user_fault,
- bool *write_fault_to_shadow_pgtable)
-{
- int level;
- gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
- bool self_changed = false;
-
- if (!(walker->pte_access & ACC_WRITE_MASK ||
- (!is_cr0_wp(vcpu->arch.mmu) && !user_fault)))
- return false;
-
- for (level = walker->level; level <= walker->max_level; level++) {
- gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
-
- self_changed |= !(gfn & mask);
- *write_fault_to_shadow_pgtable |= !gfn;
- }
-
- return self_changed;
-}
-
/*
* Page fault handler. There are several causes for a page fault:
* - there is no shadow pte for the guest pte
@@ -791,8 +757,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
struct guest_walker walker;
int r;
- unsigned long mmu_seq;
- bool is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
WARN_ON_ONCE(fault->is_tdp);
@@ -817,6 +781,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
}
fault->gfn = walker.gfn;
+ fault->max_level = walker.level;
fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
if (page_fault_handle_page_track(vcpu, fault)) {
@@ -828,24 +793,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- vcpu->arch.write_fault_to_shadow_pgtable = false;
-
- is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
-
- if (is_self_change_mapping)
- fault->max_level = PG_LEVEL_4K;
- else
- fault->max_level = walker.level;
-
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
+ r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -871,7 +819,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ if (is_page_fault_stale(vcpu, fault))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
@@ -897,65 +845,6 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
}
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
-{
- struct kvm_shadow_walk_iterator iterator;
- struct kvm_mmu_page *sp;
- u64 old_spte;
- int level;
- u64 *sptep;
-
- vcpu_clear_mmio_info(vcpu, gva);
-
- /*
- * No need to check return value here, rmap_can_add() can
- * help us to skip pte prefetch later.
- */
- mmu_topup_memory_caches(vcpu, true);
-
- if (!VALID_PAGE(root_hpa)) {
- WARN_ON(1);
- return;
- }
-
- write_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
-
- sp = sptep_to_sp(sptep);
- old_spte = *sptep;
- if (is_last_spte(old_spte, level)) {
- pt_element_t gpte;
- gpa_t pte_gpa;
-
- if (!sp->unsync)
- break;
-
- pte_gpa = FNAME(get_level1_sp_gpa)(sp);
- pte_gpa += spte_index(sptep) * sizeof(pt_element_t);
-
- mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
- if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm,
- sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
-
- if (!rmap_can_add(vcpu))
- break;
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- break;
-
- FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
- }
-
- if (!sp->unsync_children)
- break;
- }
- write_unlock(&vcpu->kvm->mmu_lock);
-}
-
/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gpa_t addr, u64 access,
@@ -988,114 +877,75 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* can't change unless all sptes pointing to it are nuked first.
*
* Returns
- * < 0: the sp should be zapped
- * 0: the sp is synced and no tlb flushing is required
- * > 0: the sp is synced and tlb flushing is required
+ * < 0: failed to sync spte
+ * 0: the spte is synced and no tlb flushing is required
+ * > 0: the spte is synced and tlb flushing is required
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
- int i;
bool host_writable;
gpa_t first_pte_gpa;
- bool flush = false;
-
- /*
- * Ignore various flags when verifying that it's safe to sync a shadow
- * page using the current MMU context.
- *
- * - level: not part of the overall MMU role and will never match as the MMU's
- * level tracks the root level
- * - access: updated based on the new guest PTE
- * - quadrant: not part of the overall MMU role (similar to level)
- */
- const union kvm_mmu_page_role sync_role_ign = {
- .level = 0xf,
- .access = 0x7,
- .quadrant = 0x3,
- .passthrough = 0x1,
- };
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
+ unsigned pte_access;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
+ gfn_t gfn;
- /*
- * Direct pages can never be unsync, and KVM should never attempt to
- * sync a shadow page for a different MMU context, e.g. if the role
- * differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
- * reserved bits checks will be wrong, etc...
- */
- if (WARN_ON_ONCE(sp->role.direct ||
- (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
- return -1;
+ if (WARN_ON_ONCE(!sp->spt[i]))
+ return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
+ pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
- u64 *sptep, spte;
- struct kvm_memory_slot *slot;
- unsigned pte_access;
- pt_element_t gpte;
- gpa_t pte_gpa;
- gfn_t gfn;
-
- if (!sp->spt[i])
- continue;
-
- pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-
- if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
- sizeof(pt_element_t)))
- return -1;
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- flush = true;
- continue;
- }
-
- gfn = gpte_to_gfn(gpte);
- pte_access = sp->role.access;
- pte_access &= FNAME(gpte_access)(gpte);
- FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return -1;
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
- continue;
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ return 1;
- /*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
- */
- if ((!pte_access && !shadow_present_mask) ||
- gfn != kvm_mmu_page_get_gfn(sp, i)) {
- drop_spte(vcpu->kvm, &sp->spt[i]);
- flush = true;
- continue;
- }
-
- /* Update the shadowed access bits in case they changed. */
- kvm_mmu_page_set_access(sp, i, pte_access);
+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(gpte);
+ FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- sptep = &sp->spt[i];
- spte = *sptep;
- host_writable = spte & shadow_host_writable_mask;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- make_spte(vcpu, sp, slot, pte_access, gfn,
- spte_to_pfn(spte), spte, true, false,
- host_writable, &spte);
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
+ return 0;
- flush |= mmu_spte_update(sptep, spte);
+ /*
+ * Drop the SPTE if the new protections would result in a RWX=0
+ * SPTE or if the gfn is changing. The RWX=0 case only affects
+ * EPT with execute-only support, i.e. EPT without an effective
+ * "present" bit, as all other paging modes will create a
+ * read-only SPTE if pte_access is zero.
+ */
+ if ((!pte_access && !shadow_present_mask) ||
+ gfn != kvm_mmu_page_get_gfn(sp, i)) {
+ drop_spte(vcpu->kvm, &sp->spt[i]);
+ return 1;
}
-
/*
- * Note, any flush is purely for KVM's correctness, e.g. when dropping
- * an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
- * unmap or dirty logging event doesn't fail to flush. The guest is
- * responsible for flushing the TLB to ensure any changes in protection
- * bits are recognized, i.e. until the guest flushes or page faults on
- * a relevant address, KVM is architecturally allowed to let vCPUs use
- * cached translations with the old protection bits.
+ * Do nothing if the permissions are unchanged. The existing SPTE is
+ * still, and prefetch_invalid_gpte() has verified that the A/D bits
+ * are set in the "new" gPTE, i.e. there is no danger of missing an A/D
+ * update due to A/D bits being set in the SPTE but not the gPTE.
*/
- return flush;
+ if (kvm_mmu_page_get_access(sp, i) == pte_access)
+ return 0;
+
+ /* Update the shadowed access bits in case they changed. */
+ kvm_mmu_page_set_access(sp, i, pte_access);
+
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
+
+ return mmu_spte_update(sptep, spte);
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 2e08b2a45361..cf2c6426a6fc 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -7,7 +7,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2020 Red Hat, Inc. and/or its affiliates.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "mmu.h"
@@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
WARN_ON_ONCE(!pte_access && !shadow_present_mask);
if (sp->role.ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
- spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -161,6 +161,18 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
+ /*
+ * For simplicity, enforce the NX huge page mitigation even if not
+ * strictly necessary. KVM could ignore the mitigation if paging is
+ * disabled in the guest, as the guest doesn't have any page tables to
+ * abuse. But to safely ignore the mitigation, KVM would have to
+ * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG
+ * is toggled on, and that's a net negative for performance when TDP is
+ * enabled. When TDP is disabled, KVM will always switch to a new MMU
+ * when CR0.PG is toggled, but leveraging that to ignore the mitigation
+ * would tie make_spte() further to vCPU/MMU state, and add complexity
+ * just to optimize a mode that is anything but performance critical.
+ */
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
@@ -305,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
shadow_user_mask | shadow_x_mask | shadow_me_value;
if (ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else
spte |= shadow_accessed_mask;
@@ -340,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte)
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
- "kvm: Access Tracking saved bit locations are not zero\n");
+ "Access Tracking saved bit locations are not zero\n");
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 7670c13ce251..1279db2eab44 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -28,10 +28,10 @@
*/
#define SPTE_TDP_AD_SHIFT 52
#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
-static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
@@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -188,7 +188,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* should not modify the SPTE.
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
- * bot AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
+ * both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
* vulnerability. Use only low bits to avoid 64-bit immediates.
*
* Only used by the TDP MMU.
@@ -219,6 +219,23 @@ static inline int spte_index(u64 *sptep)
*/
extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
+static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page)
+{
+ struct page *page = pfn_to_page((shadow_page) >> PAGE_SHIFT);
+
+ return (struct kvm_mmu_page *)page_private(page);
+}
+
+static inline struct kvm_mmu_page *spte_to_child_sp(u64 spte)
+{
+ return to_shadow_page(spte & SPTE_BASE_ADDR_MASK);
+}
+
+static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep)
+{
+ return to_shadow_page(__pa(sptep));
+}
+
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
@@ -249,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
- * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
* TDP and do the A/D type check unconditionally.
*/
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -346,7 +363,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
* A shadow-present leaf SPTE may be non-writable for 4 possible reasons:
*
* 1. To intercept writes for dirty logging. KVM write-protects huge pages
- * so that they can be split be split down into the dirty logging
+ * so that they can be split down into the dirty logging
* granularity (4KiB) whenever the guest writes to them. KVM also
* write-protects 4KiB pages so that writes can be recorded in the dirty log
* (e.g. if not using PML). SPTEs are write-protected for dirty logging
@@ -418,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte)
{
if (spte & shadow_mmu_writable_mask)
WARN_ONCE(!(spte & shadow_host_writable_mask),
- "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
spte);
else
WARN_ONCE(is_writable_pte(spte),
- "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}
static inline bool is_mmu_writable_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 39b48e7d7d1a..d2eb0d4f8710 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu_internal.h"
#include "tdp_iter.h"
@@ -15,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
-static gfn_t round_gfn_for_level(gfn_t gfn, int level)
-{
- return gfn & -KVM_PAGES_PER_HPAGE(level);
-}
-
/*
* Return the TDP iterator to the root PT and allow it to continue its
* traversal over the paging structure from there.
@@ -30,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter)
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
@@ -97,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -139,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter)
return false;
iter->level++;
- iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..fae559559a80 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -29,29 +29,49 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
WRITE_ONCE(*rcu_dereference(sptep), new_spte);
}
+/*
+ * SPTEs must be modified atomically if they are shadow-present, leaf
+ * SPTEs, and have volatile bits, i.e. has bits that can be set outside
+ * of mmu_lock. The Writable bit can be set by KVM's fast page fault
+ * handler, and Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
+{
+ return is_shadow_present_pte(old_spte) &&
+ is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte);
+}
+
static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
u64 new_spte, int level)
{
- /*
- * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
- * volatile bits, i.e. has bits that can be set outside of mmu_lock.
- * The Writable bit can be set by KVM's fast page fault handler, and
- * Accessed and Dirty bits can be set by the CPU.
- *
- * Note, non-leaf SPTEs do have Accessed bits and those bits are
- * technically volatile, but KVM doesn't consume the Accessed bit of
- * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
- * logic needs to be reassessed if KVM were to use non-leaf Accessed
- * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
- */
- if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
- spte_has_volatile_bits(old_spte))
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
__kvm_tdp_mmu_write_spte(sptep, new_spte);
return old_spte;
}
+static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
+ u64 mask, int level)
+{
+ atomic64_t *sptep_atomic;
+
+ if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
+ sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+ return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+ }
+
+ __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
+ return old_spte;
+}
+
/*
* A TDP iterator performs a pre-order walk over a TDP paging structure.
*/
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 672f0432d777..08340219c35a 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
@@ -10,26 +11,17 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = true;
-module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-
/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
struct workqueue_struct *wq;
- if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return 0;
-
wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
if (!wq)
return -ENOMEM;
- /* This should not be changed for the lifetime of the VM. */
- kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
- INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
kvm->arch.tdp_mmu_zap_wq = wq;
return 1;
}
@@ -48,13 +40,20 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
- if (!kvm->arch.tdp_mmu_enabled)
- return;
+ /*
+ * Invalidate all roots, which besides the obvious, schedules all roots
+ * for zapping and thus puts the TDP MMU's reference to each root, i.e.
+ * ultimately frees all roots.
+ */
+ kvm_tdp_mmu_invalidate_all_roots(kvm);
- /* Also waits for any queued work items. */
+ /*
+ * Destroying a workqueue also first flushes the workqueue, i.e. no
+ * need to invoke kvm_tdp_mmu_zap_invalidated_roots().
+ */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
- WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
+ WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
/*
@@ -127,16 +126,6 @@ static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root
queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}
-static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
-{
- union kvm_mmu_page_role role = page->role;
- role.invalid = true;
-
- /* No need to use cmpxchg, only the invalid bit can change. */
- role.word = xchg(&page->role.word, role.word);
- return role.invalid;
-}
-
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared)
{
@@ -145,45 +134,12 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
- WARN_ON(!root->tdp_mmu_page);
-
/*
- * The root now has refcount=0. It is valid, but readers already
- * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
- * rejects it. This remains true for the rest of the execution
- * of this function, because readers visit valid roots only
- * (except for tdp_mmu_zap_root_work(), which however
- * does not acquire any reference itself).
- *
- * Even though there are flows that need to visit all roots for
- * correctness, they all take mmu_lock for write, so they cannot yet
- * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
- * since the root still has refcount=0.
- *
- * However, tdp_mmu_zap_root can yield, and writers do not expect to
- * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
- * So the root temporarily gets an extra reference, going to refcount=1
- * while staying invalid. Readers still cannot acquire any reference;
- * but writers are now allowed to run if tdp_mmu_zap_root yields and
- * they might take an extra reference if they themselves yield.
- * Therefore, when the reference is given back by the worker,
- * there is no guarantee that the refcount is still 1. If not, whoever
- * puts the last reference will free the page, but they will not have to
- * zap the root because a root cannot go from invalid to valid.
+ * The TDP MMU itself holds a reference to each root until the root is
+ * explicitly invalidated, i.e. the final reference should be never be
+ * put for a valid root.
*/
- if (!kvm_tdp_root_mark_invalid(root)) {
- refcount_set(&root->tdp_mmu_root_count, 1);
-
- /*
- * Zapping the root in a worker is not just "nice to have";
- * it is required because kvm_tdp_mmu_invalidate_all_roots()
- * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
- * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
- * might return with some roots not zapped yet.
- */
- tdp_mmu_schedule_zap_root(kvm, root);
- return;
- }
+ KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_del_rcu(&root->link);
@@ -284,6 +240,8 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
gfn_t gfn, union kvm_mmu_page_role role)
{
+ INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
+
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
sp->role = role;
@@ -329,7 +287,14 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
root = tdp_mmu_alloc_sp(vcpu);
tdp_mmu_init_sp(root, NULL, 0, role);
- refcount_set(&root->tdp_mmu_root_count, 1);
+ /*
+ * TDP MMU roots are kept until they are explicitly invalidated, either
+ * by a memslot update or by the destruction of the VM. Initialize the
+ * refcount to two; one reference for the vCPU, and one reference for
+ * the TDP MMU itself, which is held until the root is invalidated and
+ * is ultimately put by tdp_mmu_zap_root_work().
+ */
+ refcount_set(&root->tdp_mmu_root_count, 2);
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
@@ -343,43 +308,16 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
u64 old_spte, u64 new_spte, int level,
bool shared);
-static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
-{
- if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
- return;
-
- if (is_accessed_spte(old_spte) &&
- (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
- spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
-}
-
-static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level)
-{
- bool pfn_changed;
- struct kvm_memory_slot *slot;
-
- if (level > PG_LEVEL_4K)
- return;
-
- pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
-
- if ((!is_writable_pte(old_spte) || pfn_changed) &&
- is_writable_pte(new_spte)) {
- slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
- mark_page_dirty_in_slot(kvm, slot, gfn);
- }
-}
-
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, +1);
+ atomic64_inc(&kvm->arch.tdp_mmu_pages);
}
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
kvm_account_pgtable_pages((void *)sp->spt, -1);
+ atomic64_dec(&kvm->arch.tdp_mmu_pages);
}
/**
@@ -395,14 +333,17 @@ static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
tdp_unaccount_mmu_page(kvm, sp);
+
+ if (!sp->nx_huge_page_disallowed)
+ return;
+
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
lockdep_assert_held_write(&kvm->mmu_lock);
- list_del(&sp->link);
- if (sp->lpage_disallowed)
- unaccount_huge_nx_page(kvm, sp);
+ sp->nx_huge_page_disallowed = false;
+ untrack_possible_nx_huge_page(kvm, sp);
if (shared)
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
@@ -509,7 +450,7 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
/**
- * __handle_changed_spte - handle bookkeeping associated with an SPTE change
+ * handle_changed_spte - handle bookkeeping associated with an SPTE change
* @kvm: kvm instance
* @as_id: the address space of the paging structure the SPTE was a part of
* @gfn: the base GFN that was mapped by the SPTE
@@ -520,12 +461,13 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* the MMU lock and the operation must synchronize with other
* threads that might be modifying SPTEs.
*
- * Handle bookkeeping that might result from the modification of a SPTE.
- * This function must be called for all TDP SPTE modifications.
+ * Handle bookkeeping that might result from the modification of a SPTE. Note,
+ * dirty logging updates are handled in common code, not here (see make_spte()
+ * and fast_pf_fix_direct_spte()).
*/
-static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
+static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
+ u64 old_spte, u64 new_spte, int level,
+ bool shared)
{
bool was_present = is_shadow_present_pte(old_spte);
bool is_present = is_shadow_present_pte(new_spte);
@@ -609,17 +551,10 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
if (was_present && !was_leaf &&
(is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
-}
-static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
- u64 old_spte, u64 new_spte, int level,
- bool shared)
-{
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
- shared);
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ if (was_leaf && is_accessed_spte(old_spte) &&
+ (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
/*
@@ -662,9 +597,8 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
return -EBUSY;
- __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
- new_spte, iter->level, true);
- handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
+ handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
+ new_spte, iter->level, true);
return 0;
}
@@ -684,8 +618,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
if (ret)
return ret;
- kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
- KVM_PAGES_PER_HPAGE(iter->level));
+ kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
* No other thread can overwrite the removed SPTE as they must either
@@ -701,7 +634,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
/*
- * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
+ * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
* @kvm: KVM instance
* @as_id: Address space ID, i.e. regular vs. SMM
* @sptep: Pointer to the SPTE
@@ -709,23 +642,12 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* @new_spte: The new value that will be set for the SPTE
* @gfn: The base GFN that was (or will be) mapped by the SPTE
* @level: The level _containing_ the SPTE (its parent PT's level)
- * @record_acc_track: Notify the MM subsystem of changes to the accessed state
- * of the page. Should be set unless handling an MMU
- * notifier for access tracking. Leaving record_acc_track
- * unset in that case prevents page accesses from being
- * double counted.
- * @record_dirty_log: Record the page as dirty in the dirty bitmap if
- * appropriate for the change being made. Should be set
- * unless performing certain dirty logging operations.
- * Leaving record_dirty_log unset in that case prevents page
- * writes from being double counted.
*
* Returns the old SPTE value, which _may_ be different than @old_spte if the
* SPTE had voldatile bits.
*/
-static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -740,46 +662,17 @@ static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
- __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
-
- if (record_acc_track)
- handle_changed_spte_acc_track(old_spte, new_spte, level);
- if (record_dirty_log)
- handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
- new_spte, level);
+ handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
return old_spte;
}
-static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte, bool record_acc_track,
- bool record_dirty_log)
+static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
+ u64 new_spte)
{
WARN_ON_ONCE(iter->yielded);
-
- iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
- iter->old_spte, new_spte,
- iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
-}
-
-static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
-}
-
-static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
-}
-
-static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- _tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
+ iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level);
}
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
@@ -871,7 +764,7 @@ retry:
continue;
if (!shared)
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
goto retry;
}
@@ -928,8 +821,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- __tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1, true, true);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
+ sp->gfn, sp->role.level + 1);
return true;
}
@@ -963,7 +856,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, 0);
flush = true;
}
@@ -1027,32 +920,49 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
/*
* Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
* is about to be zapped, e.g. in response to a memslots update. The actual
- * zapping is performed asynchronously, so a reference is taken on all roots.
- * Using a separate workqueue makes it easy to ensure that the destruction is
- * performed before the "fast zap" completes, without keeping a separate list
- * of invalidated roots; the list is effectively the list of work items in
- * the workqueue.
+ * zapping is performed asynchronously. Using a separate workqueue makes it
+ * easy to ensure that the destruction is performed before the "fast zap"
+ * completes, without keeping a separate list of invalidated roots; the list is
+ * effectively the list of work items in the workqueue.
*
- * Get a reference even if the root is already invalid, the asynchronous worker
- * assumes it was gifted a reference to the root it processes. Because mmu_lock
- * is held for write, it should be impossible to observe a root with zero refcount,
- * i.e. the list of roots cannot be stale.
- *
- * This has essentially the same effect for the TDP MMU
- * as updating mmu_valid_gen does for the shadow MMU.
+ * Note, the asynchronous worker is gifted the TDP MMU's reference.
+ * See kvm_tdp_mmu_get_vcpu_root_hpa().
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
- list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
- if (!root->role.invalid &&
- !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
+ /*
+ * mmu_lock must be held for write to ensure that a root doesn't become
+ * invalid while there are active readers (invalidating a root while
+ * there are active readers may or may not be problematic in practice,
+ * but it's uncharted territory and not supported).
+ *
+ * Waive the assertion if there are no users of @kvm, i.e. the VM is
+ * being destroyed after all references have been put, or if no vCPUs
+ * have been created (which means there are no roots), i.e. the VM is
+ * being destroyed in an error path of KVM_CREATE_VM.
+ */
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ refcount_read(&kvm->users_count) && kvm->created_vcpus)
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ /*
+ * As above, mmu_lock isn't held when destroying the VM! There can't
+ * be other references to @kvm, i.e. nothing else can invalidate roots
+ * or be consuming roots, but walking the list of roots does need to be
+ * guarded against roots being deleted by the asynchronous zap worker.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+ if (!root->role.invalid) {
root->role.invalid = true;
tdp_mmu_schedule_zap_root(kvm, root);
}
}
+
+ rcu_read_unlock();
}
/*
@@ -1068,7 +978,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
int ret = RET_PF_FIXED;
bool wrprot = false;
- WARN_ON(sp->role.level != fault->goal_level);
+ if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
+ return RET_PF_RETRY;
+
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
@@ -1082,8 +994,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(iter->level + 1));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
/*
* If the page fault was caused by a write but the page is write
@@ -1116,16 +1027,13 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* @kvm: kvm instance
* @iter: a tdp_iter instance currently on the SPTE that should be set
* @sp: The new TDP page table to install.
- * @account_nx: True if this page table is being installed to split a
- * non-executable huge page.
* @shared: This operation is running under the MMU lock in read mode.
*
* Returns: 0 if the new page table was installed. Non-0 if the page table
* could not be installed (e.g. the atomic compare-exchange failed).
*/
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_mmu_page *sp, bool account_nx,
- bool shared)
+ struct kvm_mmu_page *sp, bool shared)
{
u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
int ret = 0;
@@ -1135,19 +1043,17 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
if (ret)
return ret;
} else {
- tdp_mmu_set_spte(kvm, iter, spte);
+ tdp_mmu_iter_set_spte(kvm, iter, spte);
}
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
- if (account_nx)
- account_huge_nx_page(kvm, sp);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
tdp_account_mmu_page(kvm, sp);
return 0;
}
+static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
+ struct kvm_mmu_page *sp, bool shared);
+
/*
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
@@ -1155,9 +1061,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
+ struct kvm *kvm = vcpu->kvm;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
- int ret;
+ int ret = RET_PF_RETRY;
kvm_mmu_hugepage_adjust(vcpu, fault);
@@ -1166,64 +1073,70 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
rcu_read_lock();
tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ int r;
+
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
- if (iter.level == fault->goal_level)
- break;
-
/*
- * If there is an SPTE mapping a large page at a higher level
- * than the target, that SPTE must be cleared and replaced
- * with a non-leaf SPTE.
+ * If SPTE has been frozen by another thread, just give up and
+ * retry, avoiding unnecessary page table allocation and free.
*/
+ if (is_removed_spte(iter.old_spte))
+ goto retry;
+
+ if (iter.level == fault->goal_level)
+ goto map_target_level;
+
+ /* Step down into the lower level page table if it exists. */
if (is_shadow_present_pte(iter.old_spte) &&
- is_large_pte(iter.old_spte)) {
- if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
- break;
+ !is_large_pte(iter.old_spte))
+ continue;
- /*
- * The iter must explicitly re-read the spte here
- * because the new value informs the !present
- * path below.
- */
- iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
- }
+ /*
+ * The SPTE is either non-present or points to a huge page that
+ * needs to be split.
+ */
+ sp = tdp_mmu_alloc_sp(vcpu);
+ tdp_mmu_init_child_sp(sp, &iter);
- if (!is_shadow_present_pte(iter.old_spte)) {
- bool account_nx = fault->huge_page_disallowed &&
- fault->req_level >= iter.level;
+ sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
- /*
- * If SPTE has been frozen by another thread, just
- * give up and retry, avoiding unnecessary page table
- * allocation and free.
- */
- if (is_removed_spte(iter.old_spte))
- break;
+ if (is_shadow_present_pte(iter.old_spte))
+ r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
+ else
+ r = tdp_mmu_link_sp(kvm, &iter, sp, true);
- sp = tdp_mmu_alloc_sp(vcpu);
- tdp_mmu_init_child_sp(sp, &iter);
+ /*
+ * Force the guest to retry if installing an upper level SPTE
+ * failed, e.g. because a different task modified the SPTE.
+ */
+ if (r) {
+ tdp_mmu_free_sp(sp);
+ goto retry;
+ }
- if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
- tdp_mmu_free_sp(sp);
- break;
- }
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= iter.level) {
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
+ if (sp->nx_huge_page_disallowed)
+ track_possible_nx_huge_page(kvm, sp);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
}
/*
- * Force the guest to retry the access if the upper level SPTEs aren't
- * in place, or if the target leaf SPTE is frozen by another CPU.
+ * The walk aborted before reaching the target level, e.g. because the
+ * iterator detected an upper level SPTE was frozen during traversal.
*/
- if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
- rcu_read_unlock();
- return RET_PF_RETRY;
- }
+ WARN_ON_ONCE(iter.level == fault->goal_level);
+ goto retry;
+map_target_level:
ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
- rcu_read_unlock();
+retry:
+ rcu_read_unlock();
return ret;
}
@@ -1264,33 +1177,42 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
+ *
+ * No need to mark the corresponding PFN as accessed as this call is coming
+ * from the clear_young() or clear_flush_young() notifier, which uses the
+ * return value to determine if the page has been accessed.
*/
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
- u64 new_spte = 0;
+ u64 new_spte;
/* If we have a non-accessed entry we don't need to change the pte. */
if (!is_accessed_spte(iter->old_spte))
return false;
- new_spte = iter->old_spte;
-
- if (spte_ad_enabled(new_spte)) {
- new_spte &= ~shadow_accessed_mask;
+ if (spte_ad_enabled(iter->old_spte)) {
+ iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
+ iter->old_spte,
+ shadow_accessed_mask,
+ iter->level);
+ new_spte = iter->old_spte & ~shadow_accessed_mask;
} else {
/*
* Capture the dirty status of the page, so that it doesn't get
* lost when the SPTE is marked for access tracking.
*/
- if (is_writable_pte(new_spte))
- kvm_set_pfn_dirty(spte_to_pfn(new_spte));
+ if (is_writable_pte(iter->old_spte))
+ kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));
- new_spte = mark_spte_for_access_track(new_spte);
+ new_spte = mark_spte_for_access_track(iter->old_spte);
+ iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
+ iter->old_spte, new_spte,
+ iter->level);
}
- tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
-
+ trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
+ iter->old_spte, new_spte);
return true;
}
@@ -1326,15 +1248,15 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* Note, when changing a read-only SPTE, it's not strictly necessary to
* zero the SPTE before setting the new PFN, but doing so preserves the
* invariant that the PFN of a present * leaf SPTE can never change.
- * See __handle_changed_spte().
+ * See handle_changed_spte().
*/
- tdp_mmu_set_spte(kvm, iter, 0);
+ tdp_mmu_iter_set_spte(kvm, iter, 0);
if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
pte_pfn(range->pte));
- tdp_mmu_set_spte(kvm, iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, iter, new_spte);
}
return true;
@@ -1351,7 +1273,7 @@ bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
/*
* No need to handle the remote TLB flush under RCU protection, the
* target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
+ * shadow page. See the WARN on pfn_changed in handle_changed_spte().
*/
return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}
@@ -1472,6 +1394,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
return sp;
}
+/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_mmu_page *sp, bool shared)
{
@@ -1479,8 +1402,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
const int level = iter->level;
int ret, i;
- tdp_mmu_init_child_sp(sp, iter);
-
/*
* No need for atomics when writing to sp->spt since the page table has
* not been linked in yet and thus is not reachable from any other CPU.
@@ -1496,7 +1417,7 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
* correctness standpoint since the translation will be the same either
* way.
*/
- ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
+ ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
if (ret)
goto out;
@@ -1556,6 +1477,8 @@ retry:
continue;
}
+ tdp_mmu_init_child_sp(sp, &iter);
+
if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
goto retry;
@@ -1608,8 +1531,8 @@ void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t start, gfn_t end)
{
+ u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
struct tdp_iter iter;
- u64 new_spte;
bool spte_set = false;
rcu_read_lock();
@@ -1622,19 +1545,13 @@ retry:
if (!is_shadow_present_pte(iter.old_spte))
continue;
- if (spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
- if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
goto retry;
spte_set = true;
@@ -1676,8 +1593,9 @@ bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
gfn_t gfn, unsigned long mask, bool wrprot)
{
+ u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
+ shadow_dirty_mask;
struct tdp_iter iter;
- u64 new_spte;
rcu_read_lock();
@@ -1686,25 +1604,26 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
if (!mask)
break;
+ MMU_WARN_ON(kvm_ad_enabled() &&
+ spte_ad_need_write_protect(iter.old_spte));
+
if (iter.level > PG_LEVEL_4K ||
!(mask & (1UL << (iter.gfn - gfn))))
continue;
mask &= ~(1UL << (iter.gfn - gfn));
- if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
- if (is_writable_pte(iter.old_spte))
- new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- else
- continue;
- } else {
- if (iter.old_spte & shadow_dirty_mask)
- new_spte = iter.old_spte & ~shadow_dirty_mask;
- else
- continue;
- }
+ if (!(iter.old_spte & dbit))
+ continue;
+
+ iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
+ iter.old_spte, dbit,
+ iter.level);
- tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
+ trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
+ iter.old_spte,
+ iter.old_spte & ~dbit);
+ kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
}
rcu_read_unlock();
@@ -1822,7 +1741,7 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
if (new_spte == iter.old_spte)
break;
- tdp_mmu_set_spte(kvm, &iter, new_spte);
+ tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index c163f7cc23ca..0a63b1afabd3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -5,6 +5,11 @@
#include <linux/kvm_host.h>
+#include "spte.h"
+
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
@@ -66,31 +71,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
u64 *spte);
#ifdef CONFIG_X86_64
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
-{
- struct kvm_mmu_page *sp;
- hpa_t hpa = mmu->root.hpa;
-
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- /*
- * A NULL shadow page is legal when shadowing a non-paging guest with
- * PAE paging, as the MMU will be direct with root_hpa pointing at the
- * pae_root page, not a shadow page.
- */
- sp = to_shadow_page(hpa);
- return sp && is_tdp_mmu_page(sp) && sp->root_count;
-}
#else
-static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a8502e02f479..9fac1ec03463 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -13,6 +13,7 @@
* Paolo Bonzini <pbonzini@redhat.com>
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index de1fd7369736..1690d41c1830 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -9,6 +9,7 @@
* Gleb Natapov <gleb@redhat.com>
* Wei Huang <wei@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/kvm_host.h>
@@ -28,9 +29,18 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
-static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -83,7 +93,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
#undef __KVM_X86_PMU_OP
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
{
return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
}
@@ -101,10 +111,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
bool skip_pmi = false;
- /* Ignore counters that have been reprogrammed already. */
- if (test_and_set_bit(pmc->idx, pmu->reprogram_pmi))
- return;
-
if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
if (!in_pmi) {
/*
@@ -122,7 +128,6 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
} else {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
}
- kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
if (!pmc->intr || skip_pmi)
return;
@@ -147,12 +152,44 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ /*
+ * Ignore overflow events for counters that are scheduled to be
+ * reprogrammed, e.g. if a PMI for the previous event races with KVM's
+ * handling of a related guest WRMSR.
+ */
+ if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
+ return;
+
__kvm_perf_overflow(pmc, true);
+
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
+
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+ * For some model specific pebs counters with special capabilities
+ * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
+ * level to the maximum value (currently 3, backwards compatible)
+ * so that the perf subsystem would assign specific hardware counter
+ * with that capability for vPMC.
+ */
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+ * The non-zero precision level of guest event makes the ordinary
+ * guest event becomes a guest PEBS event and triggers the host
+ * PEBS PMI handler to determine whether the PEBS overflow PMI
+ * comes from the host counters or the guest.
+ */
+ return 1;
}
-static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
- u64 config, bool exclude_user,
- bool exclude_kernel, bool intr)
+static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
+ bool exclude_user, bool exclude_kernel,
+ bool intr)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
struct perf_event *event;
@@ -181,22 +218,12 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
}
if (pebs) {
/*
- * The non-zero precision level of guest event makes the ordinary
- * guest event becomes a guest PEBS event and triggers the host
- * PEBS PMI handler to determine whether the PEBS overflow PMI
- * comes from the host counters or the guest.
- *
* For most PEBS hardware events, the difference in the software
* precision levels of guest and host PEBS events will not affect
* the accuracy of the PEBS profiling result, because the "event IP"
* in the PEBS record is calibrated on the guest side.
- *
- * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
- * could possibly care here is unsupported and needs changes.
*/
- attr.precise_ip = 1;
- if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
- attr.precise_ip = 3;
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}
event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -204,14 +231,14 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
if (IS_ERR(event)) {
pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
PTR_ERR(event), pmc->idx);
- return;
+ return PTR_ERR(event);
}
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
- clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
pmc->is_paused = false;
pmc->intr = intr || pebs;
+ return 0;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
@@ -233,7 +260,8 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return false;
/* recalibrate sample period and check if it's accepted by perf core */
- if (perf_event_period(pmc->perf_event,
+ if (is_sampling_event(pmc->perf_event) &&
+ perf_event_period(pmc->perf_event,
get_sample_period(pmc, pmc->counter)))
return false;
@@ -245,55 +273,140 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
perf_event_enable(pmc->perf_event);
pmc->is_paused = false;
- clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
}
-static int cmp_u64(const void *pa, const void *pb)
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
- u64 a = *(u64 *)pa;
- u64 b = *(u64 *)pb;
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
return (a > b) - (a < b);
}
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
- struct kvm_pmu_event_filter *filter;
+ struct kvm_x86_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- bool allow_event = true;
- __u64 key;
- int idx;
if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
return false;
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (!filter)
- goto out;
+ return true;
- if (pmc_is_gp(pmc)) {
- key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
- if (bsearch(&key, filter->events, filter->nevents,
- sizeof(__u64), cmp_u64))
- allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
- else
- allow_event = filter->action == KVM_PMU_EVENT_DENY;
- } else {
- idx = pmc->idx - INTEL_PMC_IDX_FIXED;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- }
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
-out:
- return allow_event;
+ return is_fixed_event_allowed(filter, pmc->idx);
}
-void reprogram_counter(struct kvm_pmc *pmc)
+static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
+{
+ return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
+ check_pmu_event_filter(pmc);
+}
+
+static void reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
@@ -302,11 +415,11 @@ void reprogram_counter(struct kvm_pmc *pmc)
pmc_pause_counter(pmc);
- if (!pmc_speculative_in_use(pmc) || !pmc_is_enabled(pmc))
- return;
+ if (!pmc_event_is_allowed(pmc))
+ goto reprogram_complete;
- if (!check_pmu_event_filter(pmc))
- return;
+ if (pmc->counter < pmc->prev_counter)
+ __kvm_perf_overflow(pmc, false);
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
printk_once("kvm pmu: pin control bit is ignored\n");
@@ -324,18 +437,29 @@ void reprogram_counter(struct kvm_pmc *pmc)
}
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
- return;
+ goto reprogram_complete;
pmc_release_perf_event(pmc);
pmc->current_config = new_config;
- pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
- (eventsel & pmu->raw_event_mask),
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT);
+
+ /*
+ * If reprogramming fails, e.g. due to contention, leave the counter's
+ * regprogram bit set, i.e. opportunistically try again on the next PMU
+ * refresh. Don't make a new request as doing so can stall the guest
+ * if reprogramming repeatedly fails.
+ */
+ if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT))
+ return;
+
+reprogram_complete:
+ clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->prev_counter = 0;
}
-EXPORT_SYMBOL_GPL(reprogram_counter);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
@@ -345,10 +469,11 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
- if (unlikely(!pmc || !pmc->perf_event)) {
+ if (unlikely(!pmc)) {
clear_bit(bit, pmu->reprogram_pmi);
continue;
}
+
reprogram_counter(pmc);
}
@@ -418,9 +543,9 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (!pmc)
return 1;
- if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) &&
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
(static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
- (kvm_read_cr0(vcpu) & X86_CR0_PE))
+ kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;
*data = pmc_read_counter(pmc) & mask;
@@ -467,6 +592,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
+ return;
+
+ bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
static_call(kvm_x86_pmu_refresh)(vcpu);
}
@@ -522,14 +651,9 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
- u64 prev_count;
-
- prev_count = pmc->counter;
+ pmc->prev_counter = pmc->counter;
pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc);
-
- reprogram_counter(pmc);
- if (pmc->counter < prev_count)
- __kvm_perf_overflow(pmc, false);
+ kvm_pmu_request_counter_reprogram(pmc);
}
static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
@@ -542,12 +666,15 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
- u64 config = pmc->current_config;
+ u64 config;
if (pmc_is_gp(pmc)) {
+ config = pmc->eventsel;
select_os = config & ARCH_PERFMON_EVENTSEL_OS;
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
+ config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
+ pmc->idx - INTEL_PMC_IDX_FIXED);
select_os = config & 0x1;
select_user = config & 0x2;
}
@@ -564,7 +691,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
- if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
+ if (!pmc || !pmc_event_is_allowed(pmc))
continue;
/* Ignore checks for edge detect, pin control, invert and CMASK bits */
@@ -574,48 +701,143 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+ * Convert userspace events to a common in-kernel event so
+ * only one code path is needed to support both events. For
+ * the in-kernel events use masked events because they are
+ * flexible enough to handle both cases. To convert to masked
+ * events all that's needed is to add an "all ones" umask_mask,
+ * (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
- struct kvm_pmu_event_filter tmp, *filter;
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
+ struct kvm_vcpu *vcpu;
+ unsigned long i;
size_t size;
int r;
- if (copy_from_user(&tmp, argp, sizeof(tmp)))
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
return -EFAULT;
if (tmp.action != KVM_PMU_EVENT_ALLOW &&
tmp.action != KVM_PMU_EVENT_DENY)
return -EINVAL;
- if (tmp.flags != 0)
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
return -EINVAL;
if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
return -E2BIG;
size = struct_size(filter, events, tmp.nevents);
- filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (!filter)
return -ENOMEM;
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
r = -EFAULT;
- if (copy_from_user(filter, argp, size))
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
goto cleanup;
- /* Ensure nevents can't be changed between the user copies. */
- *filter = tmp;
-
- /*
- * Sort the in-kernel list so that we can search it with bsearch.
- */
- sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
mutex_unlock(&kvm->lock);
-
synchronize_srcu_expedited(&kvm->srcu);
+
+ BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
+ sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
+
r = 0;
cleanup:
kfree(filter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 5cc5721f260b..5c7bbf03b599 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -18,12 +18,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-struct kvm_event_hw_type_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
-};
-
struct kvm_pmu_ops {
bool (*hw_event_available)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
@@ -40,6 +34,9 @@ struct kvm_pmu_ops {
void (*reset)(struct kvm_vcpu *vcpu);
void (*deliver_pmi)(struct kvm_vcpu *vcpu);
void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
};
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
@@ -140,7 +137,8 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
{
- if (!pmc->perf_event || pmc->is_paused)
+ if (!pmc->perf_event || pmc->is_paused ||
+ !is_sampling_event(pmc->perf_event))
return;
perf_event_period(pmc->perf_event,
@@ -160,30 +158,48 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
extern struct x86_pmu_capability kvm_pmu_cap;
-static inline void kvm_init_pmu_capability(void)
+static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
- perf_get_x86_pmu_capability(&kvm_pmu_cap);
-
- /*
- * For Intel, only support guest architectural pmu
- * on a host with architectural pmu.
- */
- if ((is_intel && !kvm_pmu_cap.version) || !kvm_pmu_cap.num_counters_gp)
+ /*
+ * Hybrid PMUs don't play nice with virtualization without careful
+ * configuration by userspace, and KVM's APIs for reporting supported
+ * vPMU features do not account for hybrid PMUs. Disable vPMU support
+ * for hybrid PMUs until KVM gains a way to let userspace opt-in.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
enable_pmu = false;
+ if (enable_pmu) {
+ perf_get_x86_pmu_capability(&kvm_pmu_cap);
+
+ /*
+ * For Intel, only support guest architectural pmu
+ * on a host with architectural pmu.
+ */
+ if ((is_intel && !kvm_pmu_cap.version) ||
+ !kvm_pmu_cap.num_counters_gp)
+ enable_pmu = false;
+ }
+
if (!enable_pmu) {
memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap));
return;
}
kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
KVM_PMC_MAX_FIXED);
}
-void reprogram_counter(struct kvm_pmc *pmc);
+static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
+{
+ set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+}
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index a19d473d0184..a5717282bb9c 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -7,22 +7,45 @@
#include <asm/cpufeatures.h>
/*
- * Hardware-defined CPUID leafs that are scattered in the kernel, but need to
- * be directly used by KVM. Note, these word values conflict with the kernel's
- * "bug" caps, but KVM doesn't use those.
+ * Hardware-defined CPUID leafs that are either scattered by the kernel or are
+ * unknown to the kernel, but need to be directly used by KVM. Note, these
+ * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
*/
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
+ CPUID_7_1_EDX,
+ CPUID_8000_0007_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
};
+/*
+ * Define a KVM-only feature flag.
+ *
+ * For features that are scattered by cpufeatures.h, __feature_translate() also
+ * needs to be updated to translate the kernel-defined feature into the
+ * KVM-defined feature.
+ *
+ * For features that are 100% KVM-only, i.e. not defined by cpufeatures.h,
+ * forego the intermediate KVM_X86_FEATURE and directly define X86_FEATURE_* so
+ * that X86_FEATURE_* can be used in KVM. No __feature_translate() handling is
+ * needed in this case.
+ */
#define KVM_X86_FEATURE(w, f) ((w)*32 + (f))
/* Intel-defined SGX sub-features, CPUID level 0x12 (EAX). */
#define KVM_X86_FEATURE_SGX1 KVM_X86_FEATURE(CPUID_12_EAX, 0)
#define KVM_X86_FEATURE_SGX2 KVM_X86_FEATURE(CPUID_12_EAX, 1)
+#define KVM_X86_FEATURE_SGX_EDECCSSA KVM_X86_FEATURE(CPUID_12_EAX, 11)
+
+/* Intel-defined sub-features, CPUID level 0x00000007:1 (EDX) */
+#define X86_FEATURE_AVX_VNNI_INT8 KVM_X86_FEATURE(CPUID_7_1_EDX, 4)
+#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
+#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
struct cpuid_reg {
u32 function;
@@ -48,6 +71,9 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
[CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
+ [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
+ [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
};
/*
@@ -78,6 +104,10 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX1;
else if (x86_feature == X86_FEATURE_SGX2)
return KVM_X86_FEATURE_SGX2;
+ else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
+ return KVM_X86_FEATURE_SGX_EDECCSSA;
+ else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
+ return KVM_X86_FEATURE_CONSTANT_TSC;
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
new file mode 100644
index 000000000000..b42111a24cc2
--- /dev/null
+++ b/arch/x86/kvm/smm.c
@@ -0,0 +1,648 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kvm_host.h>
+#include "x86.h"
+#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
+#include "smm.h"
+#include "cpuid.h"
+#include "trace.h"
+
+#define CHECK_SMRAM32_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_32, field, offset - 0xFE00)
+
+#define CHECK_SMRAM64_OFFSET(field, offset) \
+ ASSERT_STRUCT_OFFSET(struct kvm_smram_state_64, field, offset - 0xFE00)
+
+static void check_smram_offsets(void)
+{
+ /* 32 bit SMRAM image */
+ CHECK_SMRAM32_OFFSET(reserved1, 0xFE00);
+ CHECK_SMRAM32_OFFSET(smbase, 0xFEF8);
+ CHECK_SMRAM32_OFFSET(smm_revision, 0xFEFC);
+ CHECK_SMRAM32_OFFSET(io_inst_restart, 0xFF00);
+ CHECK_SMRAM32_OFFSET(auto_hlt_restart, 0xFF02);
+ CHECK_SMRAM32_OFFSET(io_restart_rdi, 0xFF04);
+ CHECK_SMRAM32_OFFSET(io_restart_rcx, 0xFF08);
+ CHECK_SMRAM32_OFFSET(io_restart_rsi, 0xFF0C);
+ CHECK_SMRAM32_OFFSET(io_restart_rip, 0xFF10);
+ CHECK_SMRAM32_OFFSET(cr4, 0xFF14);
+ CHECK_SMRAM32_OFFSET(reserved2, 0xFF18);
+ CHECK_SMRAM32_OFFSET(int_shadow, 0xFF1A);
+ CHECK_SMRAM32_OFFSET(reserved3, 0xFF1B);
+ CHECK_SMRAM32_OFFSET(ds, 0xFF2C);
+ CHECK_SMRAM32_OFFSET(fs, 0xFF38);
+ CHECK_SMRAM32_OFFSET(gs, 0xFF44);
+ CHECK_SMRAM32_OFFSET(idtr, 0xFF50);
+ CHECK_SMRAM32_OFFSET(tr, 0xFF5C);
+ CHECK_SMRAM32_OFFSET(gdtr, 0xFF6C);
+ CHECK_SMRAM32_OFFSET(ldtr, 0xFF78);
+ CHECK_SMRAM32_OFFSET(es, 0xFF84);
+ CHECK_SMRAM32_OFFSET(cs, 0xFF90);
+ CHECK_SMRAM32_OFFSET(ss, 0xFF9C);
+ CHECK_SMRAM32_OFFSET(es_sel, 0xFFA8);
+ CHECK_SMRAM32_OFFSET(cs_sel, 0xFFAC);
+ CHECK_SMRAM32_OFFSET(ss_sel, 0xFFB0);
+ CHECK_SMRAM32_OFFSET(ds_sel, 0xFFB4);
+ CHECK_SMRAM32_OFFSET(fs_sel, 0xFFB8);
+ CHECK_SMRAM32_OFFSET(gs_sel, 0xFFBC);
+ CHECK_SMRAM32_OFFSET(ldtr_sel, 0xFFC0);
+ CHECK_SMRAM32_OFFSET(tr_sel, 0xFFC4);
+ CHECK_SMRAM32_OFFSET(dr7, 0xFFC8);
+ CHECK_SMRAM32_OFFSET(dr6, 0xFFCC);
+ CHECK_SMRAM32_OFFSET(gprs, 0xFFD0);
+ CHECK_SMRAM32_OFFSET(eip, 0xFFF0);
+ CHECK_SMRAM32_OFFSET(eflags, 0xFFF4);
+ CHECK_SMRAM32_OFFSET(cr3, 0xFFF8);
+ CHECK_SMRAM32_OFFSET(cr0, 0xFFFC);
+
+ /* 64 bit SMRAM image */
+ CHECK_SMRAM64_OFFSET(es, 0xFE00);
+ CHECK_SMRAM64_OFFSET(cs, 0xFE10);
+ CHECK_SMRAM64_OFFSET(ss, 0xFE20);
+ CHECK_SMRAM64_OFFSET(ds, 0xFE30);
+ CHECK_SMRAM64_OFFSET(fs, 0xFE40);
+ CHECK_SMRAM64_OFFSET(gs, 0xFE50);
+ CHECK_SMRAM64_OFFSET(gdtr, 0xFE60);
+ CHECK_SMRAM64_OFFSET(ldtr, 0xFE70);
+ CHECK_SMRAM64_OFFSET(idtr, 0xFE80);
+ CHECK_SMRAM64_OFFSET(tr, 0xFE90);
+ CHECK_SMRAM64_OFFSET(io_restart_rip, 0xFEA0);
+ CHECK_SMRAM64_OFFSET(io_restart_rcx, 0xFEA8);
+ CHECK_SMRAM64_OFFSET(io_restart_rsi, 0xFEB0);
+ CHECK_SMRAM64_OFFSET(io_restart_rdi, 0xFEB8);
+ CHECK_SMRAM64_OFFSET(io_restart_dword, 0xFEC0);
+ CHECK_SMRAM64_OFFSET(reserved1, 0xFEC4);
+ CHECK_SMRAM64_OFFSET(io_inst_restart, 0xFEC8);
+ CHECK_SMRAM64_OFFSET(auto_hlt_restart, 0xFEC9);
+ CHECK_SMRAM64_OFFSET(amd_nmi_mask, 0xFECA);
+ CHECK_SMRAM64_OFFSET(int_shadow, 0xFECB);
+ CHECK_SMRAM64_OFFSET(reserved2, 0xFECC);
+ CHECK_SMRAM64_OFFSET(efer, 0xFED0);
+ CHECK_SMRAM64_OFFSET(svm_guest_flag, 0xFED8);
+ CHECK_SMRAM64_OFFSET(svm_guest_vmcb_gpa, 0xFEE0);
+ CHECK_SMRAM64_OFFSET(svm_guest_virtual_int, 0xFEE8);
+ CHECK_SMRAM64_OFFSET(reserved3, 0xFEF0);
+ CHECK_SMRAM64_OFFSET(smm_revison, 0xFEFC);
+ CHECK_SMRAM64_OFFSET(smbase, 0xFF00);
+ CHECK_SMRAM64_OFFSET(reserved4, 0xFF04);
+ CHECK_SMRAM64_OFFSET(ssp, 0xFF18);
+ CHECK_SMRAM64_OFFSET(svm_guest_pat, 0xFF20);
+ CHECK_SMRAM64_OFFSET(svm_host_efer, 0xFF28);
+ CHECK_SMRAM64_OFFSET(svm_host_cr4, 0xFF30);
+ CHECK_SMRAM64_OFFSET(svm_host_cr3, 0xFF38);
+ CHECK_SMRAM64_OFFSET(svm_host_cr0, 0xFF40);
+ CHECK_SMRAM64_OFFSET(cr4, 0xFF48);
+ CHECK_SMRAM64_OFFSET(cr3, 0xFF50);
+ CHECK_SMRAM64_OFFSET(cr0, 0xFF58);
+ CHECK_SMRAM64_OFFSET(dr7, 0xFF60);
+ CHECK_SMRAM64_OFFSET(dr6, 0xFF68);
+ CHECK_SMRAM64_OFFSET(rflags, 0xFF70);
+ CHECK_SMRAM64_OFFSET(rip, 0xFF78);
+ CHECK_SMRAM64_OFFSET(gprs, 0xFF80);
+
+ BUILD_BUG_ON(sizeof(union kvm_smram) != 512);
+}
+
+#undef CHECK_SMRAM64_OFFSET
+#undef CHECK_SMRAM32_OFFSET
+
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
+{
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
+
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_32 *state,
+ u32 *selector, int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ *selector = seg.selector;
+ state->base = seg.base;
+ state->limit = seg.limit;
+ state->flags = enter_smm_get_segment_flags(&seg);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu,
+ struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment seg;
+
+ kvm_get_segment(vcpu, &seg, n);
+ state->selector = seg.selector;
+ state->attributes = enter_smm_get_segment_flags(&seg) >> 8;
+ state->limit = seg.limit;
+ state->base = seg.base;
+}
+#endif
+
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_32 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->eflags = kvm_get_rflags(vcpu);
+ smram->eip = kvm_rip_read(vcpu);
+
+ for (i = 0; i < 8; i++)
+ smram->gprs[i] = kvm_register_read_raw(vcpu, i);
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = (u32)val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = (u32)val;
+
+ enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
+ enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.base = dt.address;
+ smram->gdtr.limit = dt.size;
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.base = dt.address;
+ smram->idtr.limit = dt.size;
+
+ enter_smm_save_seg_32(vcpu, &smram->es, &smram->es_sel, VCPU_SREG_ES);
+ enter_smm_save_seg_32(vcpu, &smram->cs, &smram->cs_sel, VCPU_SREG_CS);
+ enter_smm_save_seg_32(vcpu, &smram->ss, &smram->ss_sel, VCPU_SREG_SS);
+
+ enter_smm_save_seg_32(vcpu, &smram->ds, &smram->ds_sel, VCPU_SREG_DS);
+ enter_smm_save_seg_32(vcpu, &smram->fs, &smram->fs_sel, VCPU_SREG_FS);
+ enter_smm_save_seg_32(vcpu, &smram->gs, &smram->gs_sel, VCPU_SREG_GS);
+
+ smram->cr4 = kvm_read_cr4(vcpu);
+ smram->smm_revision = 0x00020000;
+ smram->smbase = vcpu->arch.smbase;
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
+ struct kvm_smram_state_64 *smram)
+{
+ struct desc_ptr dt;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ smram->gprs[15 - i] = kvm_register_read_raw(vcpu, i);
+
+ smram->rip = kvm_rip_read(vcpu);
+ smram->rflags = kvm_get_rflags(vcpu);
+
+
+ kvm_get_dr(vcpu, 6, &val);
+ smram->dr6 = val;
+ kvm_get_dr(vcpu, 7, &val);
+ smram->dr7 = val;
+
+ smram->cr0 = kvm_read_cr0(vcpu);
+ smram->cr3 = kvm_read_cr3(vcpu);
+ smram->cr4 = kvm_read_cr4(vcpu);
+
+ smram->smbase = vcpu->arch.smbase;
+ smram->smm_revison = 0x00020064;
+
+ smram->efer = vcpu->arch.efer;
+
+ enter_smm_save_seg_64(vcpu, &smram->tr, VCPU_SREG_TR);
+
+ static_call(kvm_x86_get_idt)(vcpu, &dt);
+ smram->idtr.limit = dt.size;
+ smram->idtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->ldtr, VCPU_SREG_LDTR);
+
+ static_call(kvm_x86_get_gdt)(vcpu, &dt);
+ smram->gdtr.limit = dt.size;
+ smram->gdtr.base = dt.address;
+
+ enter_smm_save_seg_64(vcpu, &smram->es, VCPU_SREG_ES);
+ enter_smm_save_seg_64(vcpu, &smram->cs, VCPU_SREG_CS);
+ enter_smm_save_seg_64(vcpu, &smram->ss, VCPU_SREG_SS);
+ enter_smm_save_seg_64(vcpu, &smram->ds, VCPU_SREG_DS);
+ enter_smm_save_seg_64(vcpu, &smram->fs, VCPU_SREG_FS);
+ enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS);
+
+ smram->int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+}
+#endif
+
+void enter_smm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ struct desc_ptr dt;
+ unsigned long cr0;
+ union kvm_smram smram;
+
+ check_smram_offsets();
+
+ memset(smram.bytes, 0, sizeof(smram.bytes));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ enter_smm_save_state_64(vcpu, &smram.smram64);
+ else
+#endif
+ enter_smm_save_state_32(vcpu, &smram.smram32);
+
+ /*
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
+ *
+ * Kill the VM in the unlikely case of failure, because the VM
+ * can be in undefined state in this case.
+ */
+ if (static_call(kvm_x86_enter_smm)(vcpu, &smram))
+ goto error;
+
+ kvm_smm_changed(vcpu, true);
+
+ if (kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, &smram, sizeof(smram)))
+ goto error;
+
+ if (static_call(kvm_x86_get_nmi_mask)(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ static_call(kvm_x86_set_nmi_mask)(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ static_call(kvm_x86_set_cr0)(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ if (WARN_ON_ONCE(kvm_set_dr(vcpu, 7, DR7_FIXED_1)))
+ goto error;
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (static_call(kvm_x86_set_efer)(vcpu, 0))
+ goto error;
+#endif
+
+ kvm_update_cpuid_runtime(vcpu);
+ kvm_mmu_reset_context(vcpu);
+ return;
+error:
+ kvm_vm_dead(vcpu->kvm);
+}
+
+static void rsm_set_desc_flags(struct kvm_segment *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->db = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->present = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+
+ desc->unusable = !desc->present;
+ desc->padding = 0;
+}
+
+static int rsm_load_seg_32(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_32 *state,
+ u16 selector, int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = selector;
+ desc.base = state->base;
+ desc.limit = state->limit;
+ rsm_set_desc_flags(&desc, state->flags);
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+
+#ifdef CONFIG_X86_64
+
+static int rsm_load_seg_64(struct kvm_vcpu *vcpu,
+ const struct kvm_smm_seg_state_64 *state,
+ int n)
+{
+ struct kvm_segment desc;
+
+ desc.selector = state->selector;
+ rsm_set_desc_flags(&desc, state->attributes << 8);
+ desc.limit = state->limit;
+ desc.base = state->base;
+ kvm_set_segment(vcpu, &desc, n);
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+static int rsm_enter_protected_mode(struct kvm_vcpu *vcpu,
+ u64 cr0, u64 cr3, u64 cr4)
+{
+ int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = kvm_set_cr3(vcpu, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = kvm_set_cr0(vcpu, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = kvm_set_cr4(vcpu, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = kvm_set_cr3(vcpu, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_32 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ ctxt->eflags = smstate->eflags | X86_EFLAGS_FIXED;
+ ctxt->_eip = smstate->eip;
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = smstate->gprs[i];
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_32(vcpu, &smstate->tr, smstate->tr_sel, VCPU_SREG_TR);
+ rsm_load_seg_32(vcpu, &smstate->ldtr, smstate->ldtr_sel, VCPU_SREG_LDTR);
+
+ dt.address = smstate->gdtr.base;
+ dt.size = smstate->gdtr.limit;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ dt.address = smstate->idtr.base;
+ dt.size = smstate->idtr.limit;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_32(vcpu, &smstate->es, smstate->es_sel, VCPU_SREG_ES);
+ rsm_load_seg_32(vcpu, &smstate->cs, smstate->cs_sel, VCPU_SREG_CS);
+ rsm_load_seg_32(vcpu, &smstate->ss, smstate->ss_sel, VCPU_SREG_SS);
+
+ rsm_load_seg_32(vcpu, &smstate->ds, smstate->ds_sel, VCPU_SREG_DS);
+ rsm_load_seg_32(vcpu, &smstate->fs, smstate->fs_sel, VCPU_SREG_FS);
+ rsm_load_seg_32(vcpu, &smstate->gs, smstate->gs_sel, VCPU_SREG_GS);
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0,
+ smstate->cr3, smstate->cr4);
+
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return r;
+}
+
+#ifdef CONFIG_X86_64
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
+ const struct kvm_smram_state_64 *smstate)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ struct desc_ptr dt;
+ int i, r;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = smstate->gprs[15 - i];
+
+ ctxt->_eip = smstate->rip;
+ ctxt->eflags = smstate->rflags | X86_EFLAGS_FIXED;
+
+ if (kvm_set_dr(vcpu, 6, smstate->dr6))
+ return X86EMUL_UNHANDLEABLE;
+ if (kvm_set_dr(vcpu, 7, smstate->dr7))
+ return X86EMUL_UNHANDLEABLE;
+
+ vcpu->arch.smbase = smstate->smbase;
+
+ if (kvm_set_msr(vcpu, MSR_EFER, smstate->efer & ~EFER_LMA))
+ return X86EMUL_UNHANDLEABLE;
+
+ rsm_load_seg_64(vcpu, &smstate->tr, VCPU_SREG_TR);
+
+ dt.size = smstate->idtr.limit;
+ dt.address = smstate->idtr.base;
+ static_call(kvm_x86_set_idt)(vcpu, &dt);
+
+ rsm_load_seg_64(vcpu, &smstate->ldtr, VCPU_SREG_LDTR);
+
+ dt.size = smstate->gdtr.limit;
+ dt.address = smstate->gdtr.base;
+ static_call(kvm_x86_set_gdt)(vcpu, &dt);
+
+ r = rsm_enter_protected_mode(vcpu, smstate->cr0, smstate->cr3, smstate->cr4);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+
+ rsm_load_seg_64(vcpu, &smstate->es, VCPU_SREG_ES);
+ rsm_load_seg_64(vcpu, &smstate->cs, VCPU_SREG_CS);
+ rsm_load_seg_64(vcpu, &smstate->ss, VCPU_SREG_SS);
+ rsm_load_seg_64(vcpu, &smstate->ds, VCPU_SREG_DS);
+ rsm_load_seg_64(vcpu, &smstate->fs, VCPU_SREG_FS);
+ rsm_load_seg_64(vcpu, &smstate->gs, VCPU_SREG_GS);
+
+ static_call(kvm_x86_set_interrupt_shadow)(vcpu, 0);
+ ctxt->interruptibility = (u8)smstate->int_shadow;
+
+ return X86EMUL_CONTINUE;
+}
+#endif
+
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = ctxt->vcpu;
+ unsigned long cr0;
+ union kvm_smram smram;
+ u64 smbase;
+ int ret;
+
+ smbase = vcpu->arch.smbase;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfe00, smram.bytes, sizeof(smram));
+ if (ret < 0)
+ return X86EMUL_UNHANDLEABLE;
+
+ if ((vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK) == 0)
+ static_call(kvm_x86_set_nmi_mask)(vcpu, false);
+
+ kvm_smm_changed(vcpu, false);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU
+ * supports long mode.
+ */
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ struct kvm_segment cs_desc;
+ unsigned long cr4;
+
+ /* Zero CR4.PCIDE before CR0.PG. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PCIDE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PCIDE);
+
+ /* A 32-bit code segment is required to clear EFER.LMA. */
+ memset(&cs_desc, 0, sizeof(cs_desc));
+ cs_desc.type = 0xb;
+ cs_desc.s = cs_desc.g = cs_desc.present = 1;
+ kvm_set_segment(vcpu, &cs_desc, VCPU_SREG_CS);
+ }
+#endif
+
+ /* For the 64-bit case, this will clear EFER.LMA. */
+ cr0 = kvm_read_cr0(vcpu);
+ if (cr0 & X86_CR0_PE)
+ kvm_set_cr0(vcpu, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
+ unsigned long cr4, efer;
+
+ /* Clear CR4.PAE before clearing EFER.LME. */
+ cr4 = kvm_read_cr4(vcpu);
+ if (cr4 & X86_CR4_PAE)
+ kvm_set_cr4(vcpu, cr4 & ~X86_CR4_PAE);
+
+ /* And finally go back to 32-bit mode. */
+ efer = 0;
+ kvm_set_msr(vcpu, MSR_EFER, efer);
+ }
+#endif
+
+ /*
+ * Give leave_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (static_call(kvm_x86_leave_smm)(vcpu, &smram))
+ return X86EMUL_UNHANDLEABLE;
+
+#ifdef CONFIG_X86_64
+ if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return rsm_load_state_64(ctxt, &smram.smram64);
+ else
+#endif
+ return rsm_load_state_32(ctxt, &smram.smram32);
+}
diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h
new file mode 100644
index 000000000000..a1cf2ac5bd78
--- /dev/null
+++ b/arch/x86/kvm/smm.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASM_KVM_SMM_H
+#define ASM_KVM_SMM_H
+
+#include <linux/build_bug.h>
+
+#ifdef CONFIG_KVM_SMM
+
+
+/*
+ * 32 bit KVM's emulated SMM layout. Based on Intel P6 layout
+ * (https://www.sandpile.org/x86/smm.htm).
+ */
+
+struct kvm_smm_seg_state_32 {
+ u32 flags;
+ u32 limit;
+ u32 base;
+} __packed;
+
+struct kvm_smram_state_32 {
+ u32 reserved1[62];
+ u32 smbase;
+ u32 smm_revision;
+ u16 io_inst_restart;
+ u16 auto_hlt_restart;
+ u32 io_restart_rdi;
+ u32 io_restart_rcx;
+ u32 io_restart_rsi;
+ u32 io_restart_rip;
+ u32 cr4;
+
+ /* A20M#, CPL, shutdown and other reserved/undocumented fields */
+ u16 reserved2;
+ u8 int_shadow; /* KVM extension */
+ u8 reserved3[17];
+
+ struct kvm_smm_seg_state_32 ds;
+ struct kvm_smm_seg_state_32 fs;
+ struct kvm_smm_seg_state_32 gs;
+ struct kvm_smm_seg_state_32 idtr; /* IDTR has only base and limit */
+ struct kvm_smm_seg_state_32 tr;
+ u32 reserved;
+ struct kvm_smm_seg_state_32 gdtr; /* GDTR has only base and limit */
+ struct kvm_smm_seg_state_32 ldtr;
+ struct kvm_smm_seg_state_32 es;
+ struct kvm_smm_seg_state_32 cs;
+ struct kvm_smm_seg_state_32 ss;
+
+ u32 es_sel;
+ u32 cs_sel;
+ u32 ss_sel;
+ u32 ds_sel;
+ u32 fs_sel;
+ u32 gs_sel;
+ u32 ldtr_sel;
+ u32 tr_sel;
+
+ u32 dr7;
+ u32 dr6;
+ u32 gprs[8]; /* GPRS in the "natural" X86 order (EAX/ECX/EDX.../EDI) */
+ u32 eip;
+ u32 eflags;
+ u32 cr3;
+ u32 cr0;
+} __packed;
+
+
+/* 64 bit KVM's emulated SMM layout. Based on AMD64 layout */
+
+struct kvm_smm_seg_state_64 {
+ u16 selector;
+ u16 attributes;
+ u32 limit;
+ u64 base;
+};
+
+struct kvm_smram_state_64 {
+
+ struct kvm_smm_seg_state_64 es;
+ struct kvm_smm_seg_state_64 cs;
+ struct kvm_smm_seg_state_64 ss;
+ struct kvm_smm_seg_state_64 ds;
+ struct kvm_smm_seg_state_64 fs;
+ struct kvm_smm_seg_state_64 gs;
+ struct kvm_smm_seg_state_64 gdtr; /* GDTR has only base and limit*/
+ struct kvm_smm_seg_state_64 ldtr;
+ struct kvm_smm_seg_state_64 idtr; /* IDTR has only base and limit*/
+ struct kvm_smm_seg_state_64 tr;
+
+ /* I/O restart and auto halt restart are not implemented by KVM */
+ u64 io_restart_rip;
+ u64 io_restart_rcx;
+ u64 io_restart_rsi;
+ u64 io_restart_rdi;
+ u32 io_restart_dword;
+ u32 reserved1;
+ u8 io_inst_restart;
+ u8 auto_hlt_restart;
+ u8 amd_nmi_mask; /* Documented in AMD BKDG as NMI mask, not used by KVM */
+ u8 int_shadow;
+ u32 reserved2;
+
+ u64 efer;
+
+ /*
+ * Two fields below are implemented on AMD only, to store
+ * SVM guest vmcb address if the #SMI was received while in the guest mode.
+ */
+ u64 svm_guest_flag;
+ u64 svm_guest_vmcb_gpa;
+ u64 svm_guest_virtual_int; /* unknown purpose, not implemented */
+
+ u32 reserved3[3];
+ u32 smm_revison;
+ u32 smbase;
+ u32 reserved4[5];
+
+ /* ssp and svm_* fields below are not implemented by KVM */
+ u64 ssp;
+ u64 svm_guest_pat;
+ u64 svm_host_efer;
+ u64 svm_host_cr4;
+ u64 svm_host_cr3;
+ u64 svm_host_cr0;
+
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u64 gprs[16]; /* GPRS in a reversed "natural" X86 order (R15/R14/../RCX/RAX.) */
+};
+
+union kvm_smram {
+ struct kvm_smram_state_64 smram64;
+ struct kvm_smram_state_32 smram32;
+ u8 bytes[512];
+};
+
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ return 0;
+}
+
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
+void kvm_smm_changed(struct kvm_vcpu *vcpu, bool in_smm);
+void enter_smm(struct kvm_vcpu *vcpu);
+int emulator_leave_smm(struct x86_emulate_ctxt *ctxt);
+void process_smi(struct kvm_vcpu *vcpu);
+#else
+static inline int kvm_inject_smi(struct kvm_vcpu *vcpu) { return -ENOTTY; }
+static inline bool is_smm(struct kvm_vcpu *vcpu) { return false; }
+
+/*
+ * emulator_leave_smm is used as a function pointer, so the
+ * stub is defined in x86.c.
+ */
+#endif
+
+#endif
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6919dee69f18..cfc8ab773025 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
@@ -27,19 +27,38 @@
#include "irq.h"
#include "svm.h"
-/* AVIC GATAG is encoded using VM and VCPU IDs */
-#define AVIC_VCPU_ID_BITS 8
-#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+/*
+ * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e the vCPU ID,
+ * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry
+ * if an interrupt can't be delivered, e.g. because the vCPU isn't running.
+ *
+ * For the vCPU ID, use however many bits are currently allowed for the max
+ * guest physical APIC ID (limited by the size of the physical ID table), and
+ * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the
+ * size of the GATag is defined by hardware (32 bits), but is an opaque value
+ * as far as hardware is concerned.
+ */
+#define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK
-#define AVIC_VM_ID_BITS 24
-#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
-#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+#define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK)
+#define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT)
-#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
- (y & AVIC_VCPU_ID_MASK))
-#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+#define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \
+ ((vcpu_id) & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG(vm_id, vcpu_id) \
+({ \
+ u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \
+ \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \
+ WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \
+ ga_tag; \
+})
+
+static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u);
+
static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);
@@ -53,7 +72,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-enum avic_modes avic_mode;
+bool x2avic_enabled;
/*
* This is a wrapper of struct amd_iommu_ir_data.
@@ -72,20 +91,25 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- /* Note:
- * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
- * MSR accesses, while interrupt injection to a running vCPU
- * can be achieved using AVIC doorbell. The AVIC hardware still
- * accelerate MMIO accesses, but this does not cause any harm
- * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
*/
- if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
- avic_mode == AVIC_MODE_X2) {
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
@@ -241,8 +265,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+ (index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
@@ -250,47 +274,14 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
return &avic_physical_id_table[index];
}
-/*
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
-static int avic_alloc_access_page(struct kvm *kvm)
-{
- void __user *ret;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
-
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
-
- ret = __x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
- if (IS_ERR(ret)) {
- r = PTR_ERR(ret);
- goto out;
- }
-
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
@@ -299,7 +290,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_alloc_access_page(vcpu->kvm);
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
if (ret)
return ret;
}
@@ -339,6 +336,60 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
put_cpu();
}
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
+
/*
* A fast-path version of avic_kick_target_vcpus(), which attempts to match
* destination APIC ID to vCPU without looping through all vCPUs.
@@ -346,11 +397,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 l1_physical_id, dest;
- struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
@@ -367,18 +417,18 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
return -EINVAL;
- l1_physical_id = dest;
-
- if (WARN_ON_ONCE(l1_physical_id != index))
+ if (WARN_ON_ONCE(dest != index))
return -EINVAL;
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
} else {
- u32 bitmap, cluster;
- int logid_index;
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
if (apic_x2apic_mode(source)) {
/* 16 bit dest mask, 16 bit cluster id */
- bitmap = dest & 0xFFFF0000;
+ bitmap = dest & 0xFFFF;
cluster = (dest >> 16) << 4;
} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
/* 8 bit dest mask*/
@@ -390,67 +440,32 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
cluster = (dest >> 4) << 2;
}
+ /* Nothing to do if there are no destinations in the cluster. */
if (unlikely(!bitmap))
- /* guest bug: nobody to send the logical interrupt to */
return 0;
- if (!is_power_of_2(bitmap))
- /* multiple logical destinations, use slow path */
- return -EINVAL;
-
- logid_index = cluster + __ffs(bitmap);
-
- if (!apic_x2apic_mode(source)) {
- u32 *avic_logical_id_table =
- page_address(kvm_svm->avic_logical_id_table_page);
-
- u32 logid_entry = avic_logical_id_table[logid_index];
-
- if (WARN_ON_ONCE(index != logid_index))
- return -EINVAL;
-
- /* guest bug: non existing/reserved logical destination */
- if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
- return 0;
-
- l1_physical_id = logid_entry &
- AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
- } else {
- /*
- * For x2APIC logical mode, cannot leverage the index.
- * Instead, calculate physical ID from logical ID in ICRH.
- */
- int cluster = (icrh & 0xffff0000) >> 16;
- int apic = ffs(icrh & 0xffff) - 1;
-
- /*
- * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
- * contains anything but a single bit, we cannot use the
- * fast path, because it is limited to a single vCPU.
- */
- if (apic < 0 || icrh != (1 << apic))
- return -EINVAL;
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
- l1_physical_id = (cluster << 4) + apic;
- }
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
}
- target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
- if (unlikely(!target_vcpu))
- /* guest bug: non existing vCPU is a target of this IPI*/
- return 0;
-
- target_vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(target_vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
unsigned long i;
struct kvm_vcpu *vcpu;
@@ -466,21 +481,9 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
- u32 dest;
-
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_XAPIC_DEST_FIELD(icrh);
-
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
- dest, icrl & APIC_DEST_MASK)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- }
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
}
}
@@ -496,14 +499,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
* Emulate IPIs that are not handled by AVIC hardware, which
- * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
- * a trap, e.g. ICR holds the correct value and RIP has been
- * advanced, KVM is responsible only for emulating the IPI.
- * Sadly, hardware may sometimes leave the BUSY flag set, in
- * which case KVM needs to emulate the ICR write as well in
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
* order to clear the BUSY flag.
*/
if (icrl & APIC_ICR_BUSY)
@@ -519,8 +526,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
*/
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
- case AVIC_IPI_FAILURE_INVALID_TARGET:
- break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
break;
@@ -541,33 +546,33 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- int index;
u32 *logical_apic_id_table;
- int dlid = GET_APIC_LOGICAL_ID(ldr);
+ u32 cluster, index;
- if (!dlid)
- return NULL;
+ ldr = GET_APIC_LOGICAL_ID(ldr);
- if (flat) { /* flat */
- index = ffs(dlid) - 1;
- if (index > 7)
- return NULL;
- } else { /* cluster */
- int cluster = (dlid & 0xf0) >> 4;
- int apic = ffs(dlid & 0x0f) - 1;
-
- if ((apic < 0) || (apic > 7) ||
- (cluster >= 0xf))
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
return NULL;
- index = (cluster << 2) + apic;
+ ldr &= 0xf;
}
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -575,15 +580,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
entry = avic_get_logical_id_entry(vcpu, ldr, flat);
if (!entry)
- return -EINVAL;
+ return;
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
-
- return 0;
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
@@ -601,29 +604,23 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
-static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
u32 id = kvm_xapic_id(vcpu->arch.apic);
/* AVIC does not support LDR update for x2APIC */
if (apic_x2apic_mode(vcpu->arch.apic))
- return 0;
+ return;
if (ldr == svm->ldr_reg)
- return 0;
+ return;
avic_invalidate_logical_id_entry(vcpu);
- if (ldr)
- ret = avic_ldr_write(vcpu, id, ldr);
-
- if (!ret)
- svm->ldr_reg = ldr;
-
- return ret;
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
@@ -645,12 +642,14 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
switch (offset) {
case APIC_LDR:
- if (avic_handle_ldr_update(vcpu))
- return 0;
+ avic_handle_ldr_update(vcpu);
break;
case APIC_DFR:
avic_handle_dfr_update(vcpu);
break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
default:
break;
}
@@ -739,18 +738,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
- return;
-
- if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
- WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
- return;
- }
- avic_refresh_apicv_exec_ctrl(vcpu);
-}
-
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
@@ -995,23 +982,6 @@ out:
return ret;
}
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_NESTED) |
- BIT(APICV_INHIBIT_REASON_IRQWIN) |
- BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
-
-
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
@@ -1064,6 +1034,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
@@ -1092,17 +1063,15 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-
-void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
- if (!enable_apicv)
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
return;
- if (activated) {
+ if (kvm_vcpu_apicv_active(vcpu)) {
/**
* During AVIC temporary deactivation, guest could update
* APIC ID, DFR and LDR registers, which would not be trapped
@@ -1116,6 +1085,16 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
@@ -1160,37 +1139,37 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
* - Hypervisor can support both xAVIC and x2AVIC in the same guest.
* - The mode can be switched at run-time.
*/
-bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
+bool avic_hardware_setup(void)
{
if (!npt_enabled)
return false;
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ return false;
+ }
+
if (boot_cpu_has(X86_FEATURE_AVIC)) {
- avic_mode = AVIC_MODE_X1;
pr_info("AVIC enabled\n");
} else if (force_avic) {
/*
* Some older systems does not advertise AVIC support.
* See Revision Guide for specific AMD processor for more detail.
*/
- avic_mode = AVIC_MODE_X1;
pr_warn("AVIC is not supported in CPUID but force enabled");
pr_warn("Your system might crash and burn");
}
/* AVIC is a prerequisite for x2AVIC. */
- if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
- if (avic_mode == AVIC_MODE_X1) {
- avic_mode = AVIC_MODE_X2;
- pr_info("x2AVIC enabled\n");
- } else {
- pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
- pr_warn(FW_BUG "Try enable AVIC using force_avic option");
- }
- }
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled)
+ pr_info("x2AVIC enabled\n");
- if (avic_mode != AVIC_MODE_NONE)
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- return !!avic_mode;
+ return true;
}
diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
new file mode 100644
index 000000000000..088f6429b24c
--- /dev/null
+++ b/arch/x86/kvm/svm/hyperv.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD SVM specific code for Hyper-V on KVM.
+ *
+ * Copyright 2022 Red Hat, Inc. and/or its affiliates.
+ */
+#include "hyperv.h"
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
+ svm->vmcb->control.exit_info_2 = 0;
+ nested_svm_vmexit(svm);
+}
diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h
index 7d6d97968fb9..02f4784b5d44 100644
--- a/arch/x86/kvm/svm/hyperv.h
+++ b/arch/x86/kvm/svm/hyperv.h
@@ -9,27 +9,37 @@
#include <asm/mshyperv.h>
#include "../hyperv.h"
+#include "svm.h"
-/*
- * Hyper-V uses the software reserved 32 bytes in VMCB
- * control area to expose SVM enlightenments to guests.
- */
-struct hv_enlightenments {
- struct __packed hv_enlightenments_control {
- u32 nested_flush_hypercall:1;
- u32 msr_bitmap:1;
- u32 enlightened_npt_tlb: 1;
- u32 reserved:29;
- } __packed hv_enlightenments_control;
- u32 hv_vp_id;
- u64 hv_vm_id;
- u64 partition_assist_page;
- u64 reserved;
-} __packed;
+static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
-/*
- * Hyper-V uses the software reserved clean bit in VMCB
- */
-#define VMCB_HV_NESTED_ENLIGHTENMENTS VMCB_SW
+ if (!hv_vcpu)
+ return;
+
+ hv_vcpu->nested.pa_page_gpa = hve->partition_assist_page;
+ hv_vcpu->nested.vm_id = hve->hv_vm_id;
+ hv_vcpu->nested.vp_id = hve->hv_vp_id;
+}
+
+static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ if (!hv_vcpu)
+ return false;
+
+ if (!hve->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
#endif /* __ARCH_X86_KVM_SVM_HYPERV_H__ */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4c620999d230..96936ddf1b3c 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -25,6 +25,7 @@
#include "trace.h"
#include "mmu.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "lapic.h"
#include "svm.h"
@@ -137,20 +138,27 @@ void recalc_intercepts(struct vcpu_svm *svm)
c->intercepts[i] = h->intercepts[i];
if (g->int_ctl & V_INTR_MASKING_MASK) {
- /* We only want the cr8 intercept bits of L1 */
- vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
- vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
-
/*
- * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
- * affect any interrupt we may want to inject; therefore,
- * interrupt window vmexits are irrelevant to L0.
+ * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
+ * disable intercept of CR8 writes as L2's CR8 does not affect
+ * any interrupt KVM may want to inject.
+ *
+ * Similarly, disable intercept of virtual interrupts (used to
+ * detect interrupt windows) if the saved RFLAGS.IF is '0', as
+ * the effective RFLAGS.IF for L1 interrupts will never be set
+ * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
*/
- vmcb_clr_intercept(c, INTERCEPT_VINTR);
+ vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
+ if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
+ vmcb_clr_intercept(c, INTERCEPT_VINTR);
}
- /* We don't want to see VMMCALLs from a nested guest */
- vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
+ /*
+ * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
+ * flush feature is enabled.
+ */
+ if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
+ vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
for (i = 0; i < MAX_INTERCEPT; i++)
c->intercepts[i] |= g->intercepts[i];
@@ -179,8 +187,7 @@ void recalc_intercepts(struct vcpu_svm *svm)
*/
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
int i;
/*
@@ -194,7 +201,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
if (!svm->nested.force_msr_bitmap_recalc &&
kvm_hv_hypercall_enabled(&svm->vcpu) &&
hve->hv_enlightenments_control.msr_bitmap &&
- (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
+ (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
goto set_msrpm_base_pa;
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
@@ -274,6 +281,11 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
return false;
+ if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
+ !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
+ return false;
+ }
+
return true;
}
@@ -369,8 +381,8 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
/* Hyper-V extensions (Enlightened VMCB) */
if (kvm_hv_hypercall_enabled(vcpu)) {
to->clean = from->clean;
- memcpy(to->reserved_sw, from->reserved_sw,
- sizeof(struct hv_enlightenments));
+ memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
+ sizeof(to->hv_enlightenments));
}
}
@@ -414,22 +426,24 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
/* Only a few fields of int_ctl are written by the processor. */
mask = V_IRQ_MASK | V_TPR_MASK;
- if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
- svm_is_intercept(svm, INTERCEPT_VINTR)) {
- /*
- * In order to request an interrupt window, L0 is usurping
- * svm->vmcb->control.int_ctl and possibly setting V_IRQ
- * even if it was clear in L1's VMCB. Restoring it would be
- * wrong. However, in this case V_IRQ will remain true until
- * interrupt_window_interception calls svm_clear_vintr and
- * restores int_ctl. We can just leave it aside.
- */
+ /*
+ * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
+ * virtual interrupts in order to request an interrupt window, as KVM
+ * has usurped vmcb02's int_ctl. If an interrupt window opens before
+ * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
+ * If no window opens, V_IRQ will be correctly preserved in vmcb12's
+ * int_ctl (because it was never recognized while L2 was running).
+ */
+ if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
+ !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
mask &= ~V_IRQ_MASK;
- }
if (nested_vgif_enabled(svm))
mask |= V_GIF_MASK;
+ if (nested_vnmi_enabled(svm))
+ mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -474,6 +488,15 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && npt_enabled)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* TODO: optimize unconditional TLB flush/MMU sync. A partial list of
* things to fix before this can be conditional:
*
@@ -640,6 +663,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
else
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ if (vnmi) {
+ if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
+ svm->vcpu.arch.nmi_pending++;
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+ }
+ if (nested_vnmi_enabled(svm))
+ int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
+ V_NMI_ENABLE_MASK |
+ V_NMI_BLOCKING_MASK);
+ }
+
/* Copied from vmcb01. msrpm_base can be overwritten later. */
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
@@ -800,6 +834,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
if (kvm_vcpu_apicv_active(vcpu))
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ nested_svm_hv_update_vm_vp_ids(vcpu);
+
return 0;
}
@@ -822,6 +858,13 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
return 1;
}
+ /* This fails when VP assist page is enabled but the supplied GPA is bogus */
+ ret = kvm_hv_verify_vp_assist(vcpu);
+ if (ret) {
+ kvm_inject_gp(vcpu, 0);
+ return ret;
+ }
+
vmcb12_gpa = svm->vmcb->save.rax;
ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
if (ret == -EINVAL) {
@@ -988,7 +1031,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
- vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
@@ -1002,6 +1044,28 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_switch_vmcb(svm, &svm->vmcb01);
+ /*
+ * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
+ *
+ * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
+ * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
+ * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
+ * virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
+ * KVM re-requests an interrupt window if necessary, which implicitly
+ * copies this bits from vmcb02 to vmcb01.
+ *
+ * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
+ * is stored in vmcb02, but its value doesn't need to be copied from/to
+ * vmcb01 because it is copied from/to the virtual APIC's TPR register
+ * on each VM entry/exit.
+ *
+ * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
+ * V_GIF. However, GIF is architecturally clear on each VM exit, thus
+ * there is no need to copy V_GIF from vmcb02 to vmcb01.
+ */
+ if (!nested_exit_on_intr(svm))
+ kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
+
if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
svm_copy_lbrs(vmcb12, vmcb02);
svm_update_lbrv(vcpu);
@@ -1010,6 +1074,20 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
svm_update_lbrv(vcpu);
}
+ if (vnmi) {
+ if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
+ vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ if (vcpu->arch.nmi_pending) {
+ vcpu->arch.nmi_pending--;
+ vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
+ } else {
+ vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
+ }
+ }
+
/*
* On vmexit the GIF is set to false and
* no event can be injected in L1.
@@ -1084,13 +1162,19 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
* to benefit from it right away.
*/
if (kvm_apicv_activated(vcpu->kvm))
- kvm_vcpu_update_apicv(vcpu);
+ __kvm_vcpu_update_apicv(vcpu);
return 0;
}
static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
+ return;
+
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
}
@@ -1125,6 +1209,9 @@ void svm_free_nested(struct vcpu_svm *svm)
if (!svm->nested.initialized)
return;
+ if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
+ svm_switch_vmcb(svm, &svm->vmcb01);
+
svm_vcpu_free_msrpm(svm->nested.msrpm);
svm->nested.msrpm = NULL;
@@ -1143,9 +1230,6 @@ void svm_free_nested(struct vcpu_svm *svm)
svm->nested.initialized = false;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void svm_leave_nested(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1377,6 +1461,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
if (block_nested_events)
return -EBUSY;
@@ -1385,6 +1470,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
return 0;
}
+#endif
if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
if (block_nested_events)
@@ -1411,6 +1497,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
switch (exit_code) {
case SVM_EXIT_INTR:
@@ -1429,6 +1516,13 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
}
+ case SVM_EXIT_VMMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_svm_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu))
+ return NESTED_EXIT_HOST;
+ break;
default:
break;
}
@@ -1479,7 +1573,7 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
dst->virt_ext = from->virt_ext;
dst->pause_filter_count = from->pause_filter_count;
dst->pause_filter_thresh = from->pause_filter_thresh;
- /* 'clean' and 'reserved_sw' are not changed by KVM */
+ /* 'clean' and 'hv_enlightenments' are not changed by KVM */
}
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
@@ -1709,6 +1803,9 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
return false;
}
+ if (kvm_hv_verify_vp_assist(vcpu))
+ return false;
+
return true;
}
@@ -1720,4 +1817,5 @@ struct kvm_x86_nested_ops svm_nested_ops = {
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
.set_state = svm_set_nested_state,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 9d65cd095691..5fa939e411d8 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -9,6 +9,8 @@
*
* Implementation is based on pmu_intel.c file
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -159,7 +161,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~pmu->reserved_bits;
if (data != pmc->eventsel) {
pmc->eventsel = data;
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
return 0;
}
@@ -212,7 +214,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmc *pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
}
@@ -229,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.reset = amd_pmu_reset,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index efaaef2b7ae1..69ae5e1b3120 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -6,11 +6,13 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/psp.h>
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
@@ -465,9 +467,9 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages)
return;
for (i = 0; i < npages; i++) {
- page_virtual = kmap_atomic(pages[i]);
+ page_virtual = kmap_local_page(pages[i]);
clflush_cache_range(page_virtual, PAGE_SIZE);
- kunmap_atomic(page_virtual);
+ kunmap_local(page_virtual);
cond_resched();
}
}
@@ -812,7 +814,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
@@ -1293,7 +1295,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
/* Pin guest memory */
@@ -1473,7 +1475,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
@@ -1766,18 +1768,20 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
- struct file *source_kvm_file;
+ struct fd f = fdget(source_fd);
struct kvm *source_kvm;
bool charged = false;
int ret;
- source_kvm_file = fget(source_fd);
- if (!file_is_kvm(source_kvm_file)) {
+ if (!f.file)
+ return -EBADF;
+
+ if (!file_is_kvm(f.file)) {
ret = -EBADF;
goto out_fput;
}
- source_kvm = source_kvm_file->private_data;
+ source_kvm = f.file->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
@@ -1827,8 +1831,7 @@ out_dst_cgroup:
out_unlock:
sev_unlock_two_vms(kvm, source_kvm);
out_fput:
- if (source_kvm_file)
- fput(source_kvm_file);
+ fdput(f);
return ret;
}
@@ -2045,18 +2048,20 @@ failed:
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
- struct file *source_kvm_file;
+ struct fd f = fdget(source_fd);
struct kvm *source_kvm;
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- source_kvm_file = fget(source_fd);
- if (!file_is_kvm(source_kvm_file)) {
+ if (!f.file)
+ return -EBADF;
+
+ if (!file_is_kvm(f.file)) {
ret = -EBADF;
goto e_source_fput;
}
- source_kvm = source_kvm_file->private_data;
+ source_kvm = f.file->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
@@ -2102,8 +2107,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
e_unlock:
sev_unlock_two_vms(kvm, source_kvm);
e_source_fput:
- if (source_kvm_file)
- fput(source_kvm_file);
+ fdput(f);
return ret;
}
@@ -2648,7 +2652,7 @@ static int setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len)
ghcb_scratch_beg = control->ghcb_gpa +
offsetof(struct ghcb, shared_buffer);
ghcb_scratch_end = control->ghcb_gpa +
- offsetof(struct ghcb, reserved_1);
+ offsetof(struct ghcb, reserved_0xff0);
/*
* If the scratch area begins within the GHCB, it must be
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9f88c8e6766e..54089f990c8f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1,4 +1,4 @@
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
@@ -6,6 +6,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "smm.h"
#include "cpuid.h"
#include "pmu.h"
@@ -26,6 +27,7 @@
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
+#include <linux/smp.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -40,6 +42,9 @@
#include <asm/fpu/api.h>
#include <asm/virtext.h>
+
+#include <trace/events/ipi.h>
+
#include "trace.h"
#include "svm.h"
@@ -94,6 +99,7 @@ static const struct svm_direct_access_msrs {
#endif
{ .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
+ { .index = MSR_IA32_FLUSH_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
@@ -229,6 +235,8 @@ module_param(dump_invalid_vmcb, bool, 0644);
bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);
+bool vnmi = true;
+module_param(vnmi, bool, 0444);
static bool svm_gp_erratum_intercept = true;
@@ -346,12 +354,6 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return 0;
}
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -524,21 +526,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static int has_svm(void)
+static bool kvm_is_svm_supported(void)
{
+ int cpu = raw_smp_processor_id();
const char *msg;
+ u64 vm_cr;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
+ pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ return false;
}
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
- return 0;
+ return false;
}
- return 1;
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
}
void __svm_write_tsc_multiplier(u64 multiplier)
@@ -577,10 +595,6 @@ static int svm_hardware_enable(void)
if (efer & EFER_SVME)
return -EBUSY;
- if (!has_svm()) {
- pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- return -EINVAL;
- }
sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
@@ -818,7 +832,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (intercept == svm->x2avic_msrs_intercepted)
return;
- if (avic_mode != AVIC_MODE_X2 ||
+ if (!x2avic_enabled ||
!apic_x2apic_mode(svm->vcpu.arch.apic))
return;
@@ -1304,6 +1318,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (kvm_vcpu_apicv_active(vcpu))
avic_init_vmcb(svm, vmcb);
+ if (vnmi)
+ svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;
+
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
svm_clr_intercept(svm, INTERCEPT_CLGI);
@@ -1331,6 +1348,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.microcode_version = 0x01000065;
svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+
if (sev_es_guest(vcpu->kvm))
sev_es_vcpu_reset(svm);
}
@@ -1438,6 +1458,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
*/
svm_clear_current_vmcb(svm->vmcb);
+ svm_leave_nested(vcpu);
svm_free_nested(svm);
sev_free_vcpu(vcpu);
@@ -1573,6 +1594,16 @@ static void svm_set_vintr(struct vcpu_svm *svm)
svm_set_intercept(svm, INTERCEPT_VINTR);
/*
+ * Recalculating intercepts may have cleared the VINTR intercept. If
+ * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF
+ * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN.
+ * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as
+ * interrupts will never be unblocked while L2 is running.
+ */
+ if (!svm_is_intercept(svm, INTERCEPT_VINTR))
+ return;
+
+ /*
* This is just a dummy VINTR to actually cause a vmexit to happen.
* Actual injection of virtual interrupts happens through EVENTINJ.
*/
@@ -2080,7 +2111,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu)
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ pr_err("Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -2469,16 +2500,29 @@ static int task_switch_interception(struct kvm_vcpu *vcpu)
has_error_code, error_code);
}
+static void svm_clr_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_clr_intercept(svm, INTERCEPT_IRET);
+}
+
+static void svm_set_iret_intercept(struct vcpu_svm *svm)
+{
+ if (!sev_es_guest(svm->vcpu.kvm))
+ svm_set_intercept(svm, INTERCEPT_IRET);
+}
+
static int iret_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
++vcpu->stat.nmi_window_exits;
- vcpu->arch.hflags |= HF_IRET_MASK;
- if (!sev_es_guest(vcpu->kvm)) {
- svm_clr_intercept(svm, INTERCEPT_IRET);
+ svm->awaiting_iret_completion = true;
+
+ svm_clr_iret_intercept(svm);
+ if (!sev_es_guest(vcpu->kvm))
svm->nmi_iret_rip = kvm_rip_read(vcpu);
- }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
@@ -2709,12 +2753,10 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr)
msr->data = 0;
switch (msr->index) {
- case MSR_F10H_DECFG:
- if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
- case MSR_IA32_PERF_CAPABILITIES:
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -2812,7 +2854,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0x1E;
}
break;
- case MSR_F10H_DECFG:
+ case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
default:
@@ -2863,7 +2905,7 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- int r;
+ int ret = 0;
u32 ecx = msr->index;
u64 data = msr->data;
@@ -2933,21 +2975,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
*/
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
break;
- case MSR_IA32_PRED_CMD:
- if (!msr->host_initiated &&
- !guest_has_pred_cmd_msr(vcpu))
- return 1;
-
- if (data & ~PRED_CMD_IBPB)
- return 1;
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- return 1;
- if (!data)
- break;
-
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
- break;
case MSR_AMD64_VIRT_SPEC_CTRL:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
@@ -3000,17 +3027,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* guest via direct_access_msrs, and switch it via user return.
*/
preempt_disable();
- r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
+ ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
preempt_enable();
- if (r)
- return 1;
+ if (ret)
+ break;
svm->tsc_aux = data;
break;
case MSR_IA32_DEBUGCTLMSR:
if (!lbrv) {
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
if (data & DEBUGCTL_RESERVED_BITS)
@@ -3039,9 +3065,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
- case MSR_F10H_DECFG: {
+ case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
@@ -3062,7 +3088,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
default:
return kvm_set_msr_common(vcpu, msr);
}
- return 0;
+ return ret;
}
static int msr_interception(struct kvm_vcpu *vcpu)
@@ -3425,15 +3451,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
@@ -3481,10 +3498,42 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
if (svm->nmi_l1_to_l2)
return;
- vcpu->arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
+ svm->nmi_masked = true;
+ svm_set_iret_intercept(svm);
+ ++vcpu->stat.nmi_injections;
+}
+
+static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
+}
+
+static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!is_vnmi_enabled(svm))
+ return false;
+
+ if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK)
+ return false;
+
+ svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK;
+ vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
+
+ /*
+ * Because the pending NMI is serviced by hardware, KVM can't know when
+ * the NMI is "injected", but for all intents and purposes, passing the
+ * NMI off to hardware counts as injection.
+ */
++vcpu->stat.nmi_injections;
+
+ return true;
}
static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
@@ -3582,11 +3631,39 @@ static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
}
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm))
+ return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK;
+ else
+ return svm->nmi_masked;
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (is_vnmi_enabled(svm)) {
+ if (masked)
+ svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK;
+ else
+ svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
+
+ } else {
+ svm->nmi_masked = masked;
+ if (masked)
+ svm_set_iret_intercept(svm);
+ else
+ svm_clr_iret_intercept(svm);
+ }
+}
+
bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- bool ret;
if (!gif_set(svm))
return true;
@@ -3594,10 +3671,10 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return false;
- ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vcpu->arch.hflags & HF_NMI_MASK);
+ if (svm_get_nmi_mask(vcpu))
+ return true;
- return ret;
+ return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK;
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3615,26 +3692,6 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
-{
- return !!(vcpu->arch.hflags & HF_NMI_MASK);
-}
-
-static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- if (masked) {
- vcpu->arch.hflags |= HF_NMI_MASK;
- if (!sev_es_guest(vcpu->kvm))
- svm_set_intercept(svm, INTERCEPT_IRET);
- } else {
- vcpu->arch.hflags &= ~HF_NMI_MASK;
- if (!sev_es_guest(vcpu->kvm))
- svm_clr_intercept(svm, INTERCEPT_IRET);
- }
-}
-
bool svm_interrupt_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3715,7 +3772,16 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
+ /*
+ * KVM should never request an NMI window when vNMI is enabled, as KVM
+ * allows at most one to-be-injected NMI and one pending NMI, i.e. if
+ * two NMIs arrive simultaneously, KVM will inject one and set
+ * V_NMI_PENDING for the other. WARN, but continue with the standard
+ * single-step approach to try and salvage the pending NMI.
+ */
+ WARN_ON_ONCE(is_vnmi_enabled(svm));
+
+ if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -3733,11 +3799,18 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
/*
+ * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries.
+ * A TLB flush for the current ASID flushes both "host" and "guest" TLB
+ * entries, and thus is a superset of Hyper-V's fine grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
+
+ /*
* Flush only the current ASID even if the TLB flush was invoked via
* kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
* ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
@@ -3750,6 +3823,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation--;
}
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_flush_remote_tlbs() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_flush_remote_tlbs(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3832,10 +3936,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ if (svm->awaiting_iret_completion &&
(sev_es_guest(vcpu->kvm) ||
kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
- vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ svm->awaiting_iret_completion = false;
+ svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -3903,8 +4008,14 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ /*
+ * Note, the next RIP must be provided as SRCU isn't held, i.e. KVM
+ * can't read guest memory (dereference memslots) to decode the WRMSR.
+ */
+ if (control->exit_code == SVM_EXIT_MSR && control->exit_info_1 &&
+ nrips && control->next_rip)
return handle_fastpath_set_msr_irqoff(vcpu);
return EXIT_FASTPATH_NONE;
@@ -4078,17 +4189,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
@@ -4100,11 +4200,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
-static int __init svm_check_processor_compat(void)
-{
- return 0;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4113,9 +4208,11 @@ static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return false;
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/* SEV-ES guests do not support SMM, so report false */
if (kvm && sev_es_guest(kvm))
return false;
@@ -4153,8 +4250,18 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
+ svm->vnmi_enabled = vnmi && guest_cpuid_has(vcpu, X86_FEATURE_VNMI);
+
svm_recalc_instruction_intercepts(vcpu, svm);
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
+ !!guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
+ !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+
/* For sev guests, the memory encryption bit is not reserved in CR3. */
if (sev_guest(vcpu->kvm)) {
best = kvm_find_cpuid_entry(vcpu, 0x8000001F);
@@ -4372,6 +4479,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+#ifdef CONFIG_KVM_SMM
bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4399,7 +4507,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return 1;
}
-static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map_save;
@@ -4408,10 +4516,16 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
if (!is_guest_mode(vcpu))
return 0;
- /* FED8h - SVM Guest */
- put_smstate(u64, smstate, 0x7ed8, 1);
- /* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+ /*
+ * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is
+ * responsible for ensuring nested SVM and SMIs are mutually exclusive.
+ */
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 1;
+
+ smram->smram64.svm_guest_flag = 1;
+ smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4433,8 +4547,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
* that, see svm_prepare_switch_to_guest()) which must be
* preserved.
*/
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
- &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
return 1;
BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
@@ -4446,34 +4559,33 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map, map_save;
- u64 saved_efer, vmcb12_gpa;
struct vmcb *vmcb12;
int ret;
+ const struct kvm_smram_state_64 *smram64 = &smram->smram64;
+
if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
return 0;
/* Non-zero if SMI arrived while vCPU was in guest mode. */
- if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ if (!smram64->svm_guest_flag)
return 0;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
return 1;
- saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
- if (!(saved_efer & EFER_SVME))
+ if (!(smram64->efer & EFER_SVME))
return 1;
- vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
return 1;
ret = 1;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
goto unmap_map;
if (svm_allocate_nested(svm))
@@ -4495,7 +4607,7 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
vmcb12 = map.hva;
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
- ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+ ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
if (ret)
goto unmap_save;
@@ -4521,12 +4633,12 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
/* We must be in SMM; RSM will cause a vmexit anyway. */
}
}
+#endif
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
{
bool smep, smap, is_user;
- unsigned long cr4;
u64 error_code;
/* Emulation is always possible when KVM has access to all guest state. */
@@ -4618,12 +4730,11 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
goto resume_guest;
- cr4 = kvm_read_cr4(vcpu);
- smep = cr4 & X86_CR4_SMEP;
- smap = cr4 & X86_CR4_SMAP;
+ smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP);
+ smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP);
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
/*
* If the fault occurred in userspace, arbitrarily inject #GP
@@ -4695,7 +4806,9 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .name = "kvm_amd",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
@@ -4741,10 +4854,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_all = svm_flush_tlb_all,
.flush_tlb_current = svm_flush_tlb_current,
.flush_tlb_gva = svm_flush_tlb_gva,
- .flush_tlb_guest = svm_flush_tlb_current,
+ .flush_tlb_guest = svm_flush_tlb_asid,
.vcpu_pre_run = svm_vcpu_pre_run,
.vcpu_run = svm_vcpu_run,
@@ -4756,6 +4869,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.patch_hypercall = svm_patch_hypercall,
.inject_irq = svm_inject_irq,
.inject_nmi = svm_inject_nmi,
+ .is_vnmi_pending = svm_is_vnmi_pending,
+ .set_vnmi_pending = svm_set_vnmi_pending,
.inject_exception = svm_inject_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
@@ -4765,10 +4880,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = avic_set_virtual_apic_mode,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
@@ -4796,10 +4911,12 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.pi_update_irte = avic_pi_update_irte,
.setup_mce = svm_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = svm_smi_allowed,
.enter_smm = svm_enter_smm,
.leave_smm = svm_leave_smm,
.enable_smi_window = svm_enable_smi_window,
+#endif
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
@@ -4865,6 +4982,7 @@ static __init void svm_set_cpu_caps(void)
{
kvm_set_cpu_caps();
+ kvm_caps.supported_perf_cap = 0;
kvm_caps.supported_xss = 0;
/* CPUID 0x80000001 and 0x8000000A (SVM features) */
@@ -4895,6 +5013,9 @@ static __init void svm_set_cpu_caps(void)
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
+ if (vnmi)
+ kvm_cpu_cap_set(X86_FEATURE_VNMI);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -4960,6 +5081,9 @@ static __init int svm_hardware_setup(void)
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+ if (boot_cpu_has(X86_FEATURE_AUTOIBRS))
+ kvm_enable_efer_bits(EFER_AUTOIBRS);
+
/* Check for pause filtering support */
if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
pause_filter_count = 0;
@@ -4969,7 +5093,7 @@ static __init int svm_hardware_setup(void)
}
if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
@@ -4987,7 +5111,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@ -5013,12 +5137,14 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
+ enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ } else if (!x2avic_enabled) {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
}
if (vls) {
@@ -5041,6 +5167,16 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI);
+ if (vnmi)
+ pr_info("Virtual NMI enabled\n");
+
+ if (!vnmi) {
+ svm_x86_ops.is_vnmi_pending = NULL;
+ svm_x86_ops.set_vnmi_pending = NULL;
+ }
+
+
if (lbrv) {
if (!boot_cpu_has(X86_FEATURE_LBRV))
lbrv = false;
@@ -5077,10 +5213,7 @@ err:
static struct kvm_x86_init_ops svm_init_ops __initdata = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
.pmu_ops = &amd_pmu_ops,
@@ -5088,15 +5221,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ int r;
+
__unused_size_checks();
- return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
+ kvm_x86_vendor_exit();
}
module_init(svm_init)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 199a2ecef1ce..f44751dd8d5d 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -35,14 +35,8 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
-
-enum avic_modes {
- AVIC_MODE_NONE = 0,
- AVIC_MODE_X1,
- AVIC_MODE_X2,
-};
-
-extern enum avic_modes avic_mode;
+extern bool x2avic_enabled;
+extern bool vnmi;
/*
* Clean bits in VMCB.
@@ -151,7 +145,10 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
- u8 reserved_sw[32];
+ union {
+ struct hv_vmcb_enlightenments hv_enlightenments;
+ u8 reserved_sw[32];
+ };
};
struct svm_nested_state {
@@ -234,8 +231,26 @@ struct vcpu_svm {
struct svm_nested_state nested;
+ /* NMI mask value, used when vNMI is not enabled */
+ bool nmi_masked;
+
+ /*
+ * True when NMIs are still masked but guest IRET was just intercepted
+ * and KVM is waiting for RIP to change, which will signal that the
+ * intercepted IRET was retired and thus NMI can be unmasked.
+ */
+ bool awaiting_iret_completion;
+
+ /*
+ * Set when KVM is awaiting IRET completion and needs to inject NMIs as
+ * soon as the IRET completes (e.g. NMI is pending injection). KVM
+ * temporarily steals RFLAGS.TF to single-step the guest in this case
+ * in order to regain control as soon as the NMI-blocking condition
+ * goes away.
+ */
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
+
bool nmi_l1_to_l2;
unsigned long soft_int_csbase;
@@ -251,6 +266,7 @@ struct vcpu_svm {
bool pause_filter_enabled : 1;
bool pause_threshold_enabled : 1;
bool vgif_enabled : 1;
+ bool vnmi_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -277,6 +293,9 @@ struct vcpu_svm {
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
};
struct svm_cpu_data {
@@ -494,7 +513,7 @@ static inline void enable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl |= V_GIF_MASK;
else
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ svm->guest_gif = true;
}
static inline void disable_gif(struct vcpu_svm *svm)
@@ -504,7 +523,7 @@ static inline void disable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl &= ~V_GIF_MASK;
else
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ svm->guest_gif = false;
}
static inline bool gif_set(struct vcpu_svm *svm)
@@ -514,7 +533,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
if (vmcb)
return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+ return svm->guest_gif;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
@@ -522,6 +541,12 @@ static inline bool nested_npt_enabled(struct vcpu_svm *svm)
return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
}
+static inline bool nested_vnmi_enabled(struct vcpu_svm *svm)
+{
+ return svm->vnmi_enabled &&
+ (svm->nested.ctl.int_ctl & V_NMI_ENABLE_MASK);
+}
+
static inline bool is_x2apic_msrpm_offset(u32 offset)
{
/* 4 msrs per u8, and 4 u8 in u32 */
@@ -531,6 +556,27 @@ static inline bool is_x2apic_msrpm_offset(u32 offset)
(msr < (APIC_BASE_MSR + 0x100));
}
+static inline struct vmcb *get_vnmi_vmcb_l1(struct vcpu_svm *svm)
+{
+ if (!vnmi)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu))
+ return NULL;
+ else
+ return svm->vmcb01.ptr;
+}
+
+static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
+{
+ struct vmcb *vmcb = get_vnmi_vmcb_l1(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_NMI_ENABLE_MASK);
+ else
+ return false;
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
@@ -625,8 +671,23 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-
-bool avic_hardware_setup(struct kvm_x86_ops *ops);
+#define AVIC_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_NESTED) | \
+ BIT(APICV_INHIBIT_REASON_IRQWIN) | \
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_SEV) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+)
+
+bool avic_hardware_setup(void);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -638,14 +699,13 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 8cdc62c74a96..7af8422d3382 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V for SVM.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
@@ -14,9 +15,9 @@
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve;
+ struct hv_vmcb_enlightenments *hve;
struct hv_partition_assist_pg **p_hv_pa_pg =
&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
@@ -26,13 +27,13 @@ int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
if (!*p_hv_pa_pg)
return -ENOMEM;
- hve = (struct hv_enlightenments *)to_svm(vcpu)->vmcb->control.reserved_sw;
+ hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
hve->partition_assist_page = __pa(*p_hv_pa_pg);
hve->hv_vm_id = (unsigned long)vcpu->kvm;
if (!hve->hv_enlightenments_control.nested_flush_hypercall) {
hve->hv_enlightenments_control.nested_flush_hypercall = 1;
- vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(to_svm(vcpu)->vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
return 0;
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index e2fc59380465..f85bc617ffe4 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -6,6 +6,8 @@
#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#include <asm/mshyperv.h>
+
#if IS_ENABLED(CONFIG_HYPERV)
#include "kvm_onhyperv.h"
@@ -13,12 +15,22 @@
static struct kvm_x86_ops svm_x86_ops;
-int svm_hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu);
+int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
+
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
+
+ BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
+ sizeof(vmcb->control.reserved_sw));
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
@@ -28,20 +40,19 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
hve->hv_enlightenments_control.msr_bitmap = 1;
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
- pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
- svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- svm_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
+ svm_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ svm_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
int cpu;
- pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
for_each_online_cpu(cpu) {
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(cpu);
@@ -51,8 +62,8 @@ static inline void svm_hv_hardware_setup(void)
vp_ap->nested_control.features.directhypercall = 1;
}
- svm_x86_ops.enable_direct_tlbflush =
- svm_hv_enable_direct_tlbflush;
+ svm_x86_ops.enable_l2_tlb_flush =
+ svm_hv_enable_l2_tlb_flush;
}
}
@@ -60,32 +71,34 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct kvm_vcpu *vcpu)
{
struct vmcb *vmcb = to_svm(vcpu)->vmcb;
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
if (hve->hv_enlightenments_control.msr_bitmap)
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
-static inline void svm_hv_update_vp_id(struct vmcb *vmcb,
- struct kvm_vcpu *vcpu)
+static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
{
- struct hv_enlightenments *hve =
- (struct hv_enlightenments *)vmcb->control.reserved_sw;
+ struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
u32 vp_index = kvm_hv_get_vpindex(vcpu);
if (hve->hv_vp_id != vp_index) {
hve->hv_vp_id = vp_index;
- vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
+ vmcb_mark_dirty(vmcb, HV_VMCB_NESTED_ENLIGHTENMENTS);
}
}
#else
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
}
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 34367dc203f2..8e8295e774f0 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/asm-offsets.h>
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bc25589ad588..83843379813e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -113,12 +113,13 @@ TRACE_EVENT(kvm_hv_hypercall_done,
* Tracepoint for Xen hypercall.
*/
TRACE_EVENT(kvm_xen_hypercall,
- TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3, unsigned long a4,
- unsigned long a5),
- TP_ARGS(nr, a0, a1, a2, a3, a4, a5),
+ TP_PROTO(u8 cpl, unsigned long nr,
+ unsigned long a0, unsigned long a1, unsigned long a2,
+ unsigned long a3, unsigned long a4, unsigned long a5),
+ TP_ARGS(cpl, nr, a0, a1, a2, a3, a4, a5),
TP_STRUCT__entry(
+ __field(u8, cpl)
__field(unsigned long, nr)
__field(unsigned long, a0)
__field(unsigned long, a1)
@@ -129,6 +130,7 @@ TRACE_EVENT(kvm_xen_hypercall,
),
TP_fast_assign(
+ __entry->cpl = cpl;
__entry->nr = nr;
__entry->a0 = a0;
__entry->a1 = a1;
@@ -138,8 +140,9 @@ TRACE_EVENT(kvm_xen_hypercall,
__entry->a4 = a5;
),
- TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
- __entry->nr, __entry->a0, __entry->a1, __entry->a2,
+ TP_printk("cpl %d nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx a4 0x%lx a5 %lx",
+ __entry->cpl, __entry->nr,
+ __entry->a0, __entry->a1, __entry->a2,
__entry->a3, __entry->a4, __entry->a5)
);
@@ -1547,38 +1550,41 @@ TRACE_EVENT(kvm_hv_timer_state,
* Tracepoint for kvm_hv_flush_tlb.
*/
TRACE_EVENT(kvm_hv_flush_tlb,
- TP_PROTO(u64 processor_mask, u64 address_space, u64 flags),
- TP_ARGS(processor_mask, address_space, flags),
+ TP_PROTO(u64 processor_mask, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(processor_mask, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, processor_mask)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
__entry->processor_mask = processor_mask;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
- TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx",
+ TP_printk("processor_mask 0x%llx address_space 0x%llx flags 0x%llx %s",
__entry->processor_mask, __entry->address_space,
- __entry->flags)
+ __entry->flags, __entry->guest_mode ? "(L2)" : "")
);
/*
* Tracepoint for kvm_hv_flush_tlb_ex.
*/
TRACE_EVENT(kvm_hv_flush_tlb_ex,
- TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags),
- TP_ARGS(valid_bank_mask, format, address_space, flags),
+ TP_PROTO(u64 valid_bank_mask, u64 format, u64 address_space, u64 flags, bool guest_mode),
+ TP_ARGS(valid_bank_mask, format, address_space, flags, guest_mode),
TP_STRUCT__entry(
__field(u64, valid_bank_mask)
__field(u64, format)
__field(u64, address_space)
__field(u64, flags)
+ __field(bool, guest_mode)
),
TP_fast_assign(
@@ -1586,12 +1592,14 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex,
__entry->format = format;
__entry->address_space = address_space;
__entry->flags = flags;
+ __entry->guest_mode = guest_mode;
),
TP_printk("valid_bank_mask 0x%llx format 0x%llx "
- "address_space 0x%llx flags 0x%llx",
+ "address_space 0x%llx flags 0x%llx %s",
__entry->valid_bank_mask, __entry->format,
- __entry->address_space, __entry->flags)
+ __entry->address_space, __entry->flags,
+ __entry->guest_mode ? "(L2)" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 07254314f3dd..45162c1bcd8f 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -66,13 +66,13 @@ struct vmcs_config {
u64 misc;
struct nested_vmx_msrs nested;
};
-extern struct vmcs_config vmcs_config;
+extern struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability {
u32 ept;
u32 vpid;
};
-extern struct vmx_capability vmx_capability;
+extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
@@ -395,30 +395,6 @@ static inline bool vmx_pebs_supported(void)
return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept;
}
-static inline u64 vmx_get_perf_capabilities(void)
-{
- u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
- u64 host_perf_cap = 0;
-
- if (!enable_pmu)
- return 0;
-
- if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
-
- if (x86_perf_get_lbr(&lbr) >= 0 && lbr.nr)
- perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
-
- if (vmx_pebs_supported()) {
- perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
- }
-
- return perf_cap;
-}
-
static inline bool cpu_has_notify_vmexit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/hyperv.c
index d8b23c96d627..79450e1ed7cf 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -1,18 +1,122 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/errno.h>
#include <linux/smp.h>
-#include "../hyperv.h"
#include "../cpuid.h"
-#include "evmcs.h"
+#include "hyperv.h"
+#include "nested.h"
#include "vmcs.h"
#include "vmx.h"
#include "trace.h"
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
-DEFINE_STATIC_KEY_FALSE(enable_evmcs);
+/*
+ * Enlightened VMCSv1 doesn't support these:
+ *
+ * POSTED_INTR_NV = 0x00000002,
+ * GUEST_INTR_STATUS = 0x00000810,
+ * APIC_ACCESS_ADDR = 0x00002014,
+ * POSTED_INTR_DESC_ADDR = 0x00002016,
+ * EOI_EXIT_BITMAP0 = 0x0000201c,
+ * EOI_EXIT_BITMAP1 = 0x0000201e,
+ * EOI_EXIT_BITMAP2 = 0x00002020,
+ * EOI_EXIT_BITMAP3 = 0x00002022,
+ * GUEST_PML_INDEX = 0x00000812,
+ * PML_ADDRESS = 0x0000200e,
+ * VM_FUNCTION_CONTROL = 0x00002018,
+ * EPTP_LIST_ADDRESS = 0x00002024,
+ * VMREAD_BITMAP = 0x00002026,
+ * VMWRITE_BITMAP = 0x00002028,
+ *
+ * TSC_MULTIPLIER = 0x00002032,
+ * PLE_GAP = 0x00004020,
+ * PLE_WINDOW = 0x00004022,
+ * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ *
+ * Currently unsupported in KVM:
+ * GUEST_IA32_RTIT_CTL = 0x00002814,
+ */
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
@@ -322,24 +426,17 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu)
{
- struct hv_vp_assist_page assist_page;
-
- *evmcs_gpa = -1ull;
-
- if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
- return false;
-
- if (unlikely(!assist_page.enlighten_vmentry))
- return false;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- if (unlikely(!evmptr_is_valid(assist_page.current_nested_vmcs)))
- return false;
+ if (unlikely(kvm_hv_get_assist_page(vcpu)))
+ return EVMPTR_INVALID;
- *evmcs_gpa = assist_page.current_nested_vmcs;
+ if (unlikely(!hv_vcpu->vp_assist_page.enlighten_vmentry))
+ return EVMPTR_INVALID;
- return true;
+ return hv_vcpu->vp_assist_page.current_nested_vmcs;
}
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
@@ -368,35 +465,43 @@ enum evmcs_revision {
enum evmcs_ctrl_type {
EVMCS_EXIT_CTRLS,
EVMCS_ENTRY_CTRLS,
+ EVMCS_EXEC_CTRL,
EVMCS_2NDEXEC,
+ EVMCS_3RDEXEC,
EVMCS_PINCTRL,
EVMCS_VMFUNC,
NR_EVMCS_CTRLS,
};
-static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
[EVMCS_EXIT_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL,
},
[EVMCS_ENTRY_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_EXEC_CTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL,
},
[EVMCS_2NDEXEC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING,
+ },
+ [EVMCS_3RDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC,
},
[EVMCS_PINCTRL] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL,
},
[EVMCS_VMFUNC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC,
},
};
-static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type)
+static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type)
{
enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
- return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev];
+ return evmcs_supported_ctrls[ctrl_type][evmcs_rev];
}
static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
@@ -420,7 +525,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
{
u32 ctl_low = (u32)*pdata;
u32 ctl_high = (u32)(*pdata >> 32);
- u32 unsupported_ctrls;
+ u32 supported_ctrls;
/*
* Hyper-V 2016 and 2019 try using these features even when eVMCS
@@ -429,27 +534,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL);
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC);
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL);
break;
case MSR_IA32_VMX_VMFUNC:
- ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC);
+ ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC);
break;
}
@@ -459,7 +568,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
u32 val)
{
- return !(val & evmcs_get_unsupported_ctls(ctrl_type));
+ return !(val & ~evmcs_get_supported_ctls(ctrl_type));
}
int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
@@ -468,6 +577,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
vmcs12->pin_based_vm_exec_control)))
return -EINVAL;
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL,
+ vmcs12->cpu_based_vm_exec_control)))
+ return -EINVAL;
+
if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
vmcs12->secondary_vm_exec_control)))
return -EINVAL;
@@ -495,6 +608,40 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
return 0;
}
+#if IS_ENABLED(CONFIG_HYPERV)
+DEFINE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
+ * is: in case a feature has corresponding fields in eVMCS described and it was
+ * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a
+ * feature which has no corresponding eVMCS field, this likely means that KVM
+ * needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
+#endif
+
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
@@ -507,3 +654,23 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return 0;
}
+
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+
+ if (!hv_vcpu || !evmcs)
+ return false;
+
+ if (!evmcs->hv_enlightenments_control.nested_flush_hypercall)
+ return false;
+
+ return hv_vcpu->vp_assist_page.nested_control.features.directhypercall;
+}
+
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
+{
+ nested_vmx_vmexit(vcpu, HV_VMX_SYNTHETIC_EXIT_REASON_TRAP_AFTER_FLUSH, 0, 0);
+}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/hyperv.h
index 6f746ef3c038..9623fe1651c4 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __KVM_X86_VMX_EVMCS_H
-#define __KVM_X86_VMX_EVMCS_H
+#ifndef __KVM_X86_VMX_HYPERV_H
+#define __KVM_X86_VMX_HYPERV_H
#include <linux/jump_label.h>
@@ -8,61 +8,18 @@
#include <asm/mshyperv.h>
#include <asm/vmx.h>
+#include "../hyperv.h"
+
#include "capabilities.h"
#include "vmcs.h"
#include "vmcs12.h"
struct vmcs_config;
-DECLARE_STATIC_KEY_FALSE(enable_evmcs);
-
#define current_evmcs ((struct hv_enlightened_vmcs *)this_cpu_read(current_vmcs))
#define KVM_EVMCS_VERSION 1
-/*
- * Enlightened VMCSv1 doesn't support these:
- *
- * POSTED_INTR_NV = 0x00000002,
- * GUEST_INTR_STATUS = 0x00000810,
- * APIC_ACCESS_ADDR = 0x00002014,
- * POSTED_INTR_DESC_ADDR = 0x00002016,
- * EOI_EXIT_BITMAP0 = 0x0000201c,
- * EOI_EXIT_BITMAP1 = 0x0000201e,
- * EOI_EXIT_BITMAP2 = 0x00002020,
- * EOI_EXIT_BITMAP3 = 0x00002022,
- * GUEST_PML_INDEX = 0x00000812,
- * PML_ADDRESS = 0x0000200e,
- * VM_FUNCTION_CONTROL = 0x00002018,
- * EPTP_LIST_ADDRESS = 0x00002024,
- * VMREAD_BITMAP = 0x00002026,
- * VMWRITE_BITMAP = 0x00002028,
- *
- * TSC_MULTIPLIER = 0x00002032,
- * PLE_GAP = 0x00004020,
- * PLE_WINDOW = 0x00004022,
- * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
- *
- * Currently unsupported in KVM:
- * GUEST_IA32_RTIT_CTL = 0x00002814,
- */
-#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
- PIN_BASED_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
-#define EVMCS1_UNSUPPORTED_2NDEXEC \
- (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
- SECONDARY_EXEC_APIC_REGISTER_VIRT | \
- SECONDARY_EXEC_ENABLE_PML | \
- SECONDARY_EXEC_ENABLE_VMFUNC | \
- SECONDARY_EXEC_SHADOW_VMCS | \
- SECONDARY_EXEC_TSC_SCALING | \
- SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
-#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -110,14 +67,19 @@ static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
#if IS_ENABLED(CONFIG_HYPERV)
+DECLARE_STATIC_KEY_FALSE(__kvm_is_using_evmcs);
+
+static __always_inline bool kvm_is_using_evmcs(void)
+{
+ return static_branch_unlikely(&__kvm_is_using_evmcs);
+}
+
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
{
int offset = evmcs_field_offset(field, clean_field);
- WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
-
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
return offset;
}
@@ -134,7 +96,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write32(unsigned long field, u32 value)
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -146,7 +108,7 @@ static inline void evmcs_write32(unsigned long field, u32 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write16(unsigned long field, u16 value)
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -158,7 +120,7 @@ static inline void evmcs_write16(unsigned long field, u16 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline u64 evmcs_read64(unsigned long field)
+static __always_inline u64 evmcs_read64(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -168,7 +130,7 @@ static inline u64 evmcs_read64(unsigned long field)
return *(u64 *)((char *)current_evmcs + offset);
}
-static inline u32 evmcs_read32(unsigned long field)
+static __always_inline u32 evmcs_read32(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -178,7 +140,7 @@ static inline u32 evmcs_read32(unsigned long field)
return *(u32 *)((char *)current_evmcs + offset);
}
-static inline u16 evmcs_read16(unsigned long field)
+static __always_inline u16 evmcs_read16(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -188,16 +150,6 @@ static inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -209,15 +161,16 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
+static __always_inline bool kvm_is_using_evmcs(void) { return false; }
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
-static inline void evmcs_write32(unsigned long field, u32 value) {}
-static inline void evmcs_write16(unsigned long field, u16 value) {}
-static inline u64 evmcs_read64(unsigned long field) { return 0; }
-static inline u32 evmcs_read32(unsigned long field) { return 0; }
-static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
@@ -235,11 +188,13 @@ enum nested_evmptrld_status {
EVMPTRLD_ERROR,
};
-bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
+u64 nested_get_evmptr(struct kvm_vcpu *vcpu);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
+bool nested_evmcs_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu);
+void vmx_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu);
-#endif /* __KVM_X86_VMX_EVMCS_H */
+#endif /* __KVM_X86_VMX_HYPERV_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 8b15a00ca8e6..742bb2d87e39 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/objtool.h>
#include <linux/percpu.h>
@@ -7,7 +8,6 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -16,6 +16,7 @@
#include "trace.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -225,6 +226,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
@@ -233,6 +235,12 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
}
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ if (hv_vcpu) {
+ hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
+ hv_vcpu->nested.vm_id = 0;
+ hv_vcpu->nested.vp_id = 0;
+ }
}
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
@@ -350,6 +358,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
gpa_t addr)
{
+ unsigned long roots = 0;
uint i;
struct kvm_mmu_root_info *cached_root;
@@ -360,8 +369,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
eptp))
- vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
+ roots |= KVM_MMU_ROOT_PREVIOUS(i);
}
+ if (roots)
+ kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -646,6 +657,9 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_PRED_CMD, MSR_TYPE_W);
+ nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
+
kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -1126,6 +1140,15 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
+ * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
+ * L2's VP_ID upon request from the guest. Make sure we check for
+ * pending entries in the right FIFO upon L1/L2 transition as these
+ * requests are put by other vCPUs asynchronously.
+ */
+ if (to_hv_vcpu(vcpu) && enable_ept)
+ kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);
+
+ /*
* If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
* for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
* full TLB flush from the guest's perspective. This is required even
@@ -1563,12 +1586,20 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
{
struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);
/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
vmcs12->tpr_threshold = evmcs->tpr_threshold;
vmcs12->guest_rip = evmcs->guest_rip;
if (unlikely(!(hv_clean_fields &
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
+ hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
+ hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
+ hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
+ }
+
+ if (unlikely(!(hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
vmcs12->guest_rsp = evmcs->guest_rsp;
vmcs12->guest_rflags = evmcs->guest_rflags;
@@ -1983,7 +2014,8 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
- if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
+ evmcs_gpa = nested_get_evmptr(vcpu);
+ if (!evmptr_is_valid(evmcs_gpa)) {
nested_release_evmcs(vcpu);
return EVMPTRLD_DISABLED;
}
@@ -2569,12 +2601,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
nested_ept_init_mmu_context(vcpu);
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
- * bits which we consider mandatory enabled.
- * The CR0_READ_SHADOW is what L2 should have expected to read given
- * the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
- * have more bits than L1 expected.
+ * Override the CR0/CR4 read shadows after setting the effective guest
+ * CR0/CR4. The common helpers also set the shadows, but they don't
+ * account for vmcs12's cr0/4_guest_host_mask.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
@@ -2886,7 +2915,7 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- bool ia32e;
+ bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
@@ -2906,12 +2935,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
vmcs12->host_ia32_perf_global_ctrl)))
return -EINVAL;
-#ifdef CONFIG_X86_64
- ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
-#else
- ia32e = false;
-#endif
-
if (ia32e) {
if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
@@ -3005,7 +3028,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12,
enum vm_entry_failure_code *entry_failure_code)
{
- bool ia32e;
+ bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);
*entry_failure_code = ENTRY_FAIL_DEFAULT;
@@ -3031,6 +3054,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
vmcs12->guest_ia32_perf_global_ctrl)))
return -EINVAL;
+ if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
+ return -EINVAL;
+
+ if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
+ CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-entry control is 1, the following checks
* are performed on the field for the IA32_EFER MSR:
@@ -3042,7 +3072,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
*/
if (to_vmx(vcpu)->nested.nested_run_pending &&
(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
- ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
@@ -3257,6 +3286,12 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{
+ /*
+ * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy
+ * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory
+ * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post
+ * migration.
+ */
if (!nested_get_evmcs_page(vcpu)) {
pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
__func__);
@@ -3845,7 +3880,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
exit_qual = 0;
}
- if (ex->has_error_code) {
+ /*
+ * Unlike AMD's Paged Real Mode, which reports an error code on #PF
+ * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the
+ * "has error code" flags on VM-Exit if the CPU is in Real Mode.
+ */
+ if (ex->has_error_code && is_protmode(vcpu)) {
/*
* Intel CPUs do not generate error codes with bits 31:16 set,
* and more importantly VMX disallows setting bits 31:16 in the
@@ -4455,7 +4495,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
* (KVM doesn't change it);
*/
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
@@ -4606,7 +4646,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
- vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
@@ -4773,6 +4813,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+ /*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect
+ * branch predictors when transitioning from L2 to L1, as L1 expects
+ * hardware (KVM in this case) to provide separate predictor modes.
+ * Bare metal isolates VMX root (host) from VMX non-root (guest), but
+ * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
+ * separate modes for L2 vs L1.
+ */
+ if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ indirect_branch_prediction_barrier();
+
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -4860,6 +4911,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{
+ kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
}
@@ -5105,24 +5157,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
- * that have higher priority than VM-Exit (see Intel SDM's pseudocode
- * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
- * running the guest, i.e. KVM needs to check the _guest_ values.
+ * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter
+ * the guest and so cannot rely on hardware to perform the check,
+ * which has higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON).
*
- * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
- * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
- * Mode, but KVM will never take the guest out of those modes.
+ * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86
+ * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't
+ * force any of the relevant guest state. For a restricted guest, KVM
+ * does force CR0.PE=1, but only to also force VM86 in order to emulate
+ * Real Mode, and so there's no need to check CR0.PE manually.
*/
- if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
- !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
/*
- * CPL=0 and all other checks that are lower priority than VM-Exit must
- * be checked manually.
+ * The CPL is checked for "not in VMX operation" and for "in VMX root",
+ * and has higher priority than the VM-Fail due to being post-VMXON,
+ * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root,
+ * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits
+ * from L2 to L1, i.e. there's no need to check for the vCPU being in
+ * VMX non-root.
+ *
+ * Forwarding the VM-Exit unconditionally, i.e. without performing the
+ * #UD checks (see above), is functionally ok because KVM doesn't allow
+ * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's
+ * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are
+ * missed by hardware due to shadowing CR0 and/or CR4.
*/
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
@@ -5132,6 +5195,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon)
return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ /*
+ * Invalid CR0/CR4 generates #GP. These checks are performed if and
+ * only if the vCPU isn't already in VMX operation, i.e. effectively
+ * have lower priority than the VM-Fail above.
+ */
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
@@ -5211,7 +5285,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 zero = 0;
gpa_t vmptr;
- u64 evmcs_gpa;
int r;
if (!nested_vmx_check_permission(vcpu))
@@ -5237,14 +5310,23 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
if (likely(!guest_cpuid_has_evmcs(vcpu) ||
- !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
+ !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
- kvm_vcpu_write_guest(vcpu,
- vmptr + offsetof(struct vmcs12,
- launch_state),
- &zero, sizeof(zero));
+ /*
+ * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
+ * for VMCLEAR includes a "ensure that data for VMCS referenced
+ * by the operand is in memory" clause that guards writes to
+ * memory, i.e. doing nothing for I/O is architecturally valid.
+ *
+ * FIXME: Suppress failures if and only if no memslot is found,
+ * i.e. exit to userspace if __copy_to_user() fails.
+ */
+ (void)kvm_vcpu_write_guest(vcpu,
+ vmptr + offsetof(struct vmcs12,
+ launch_state),
+ &zero, sizeof(zero));
} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
nested_release_evmcs(vcpu);
}
@@ -5799,11 +5881,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
u32 function = kvm_rax_read(vcpu);
/*
- * VMFUNC is only supported for nested guests, but we always enable the
- * secondary control for simplicity; for non-nested mode, fake that we
- * didn't by injecting #UD.
+ * VMFUNC should never execute cleanly while L1 is active; KVM supports
+ * VMFUNC for nested VMs, but not for L1.
*/
- if (!is_guest_mode(vcpu)) {
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6134,6 +6215,11 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
* Handle L2's bus locks in L0 directly.
*/
return true;
+ case EXIT_REASON_VMCALL:
+ /* Hyper-V L2 TLB flush hypercall is handled by L0 */
+ return guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
+ nested_evmcs_l2_tlb_flush_enabled(vcpu) &&
+ kvm_hv_is_tlb_flush_hcall(vcpu);
default:
break;
}
@@ -6446,9 +6532,6 @@ out:
return kvm_state.size;
}
-/*
- * Forcibly leave nested mode in order to be able to reset the VCPU later on.
- */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -6684,36 +6767,9 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}
-/*
- * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
- * returned for the various VMX controls MSRs when nested VMX is enabled.
- * The same values should also be used to verify that vmcs12 control fields are
- * valid during nested entry from L1 to L2.
- * Each of these control msrs has a low and high 32-bit half: A low bit is on
- * if the corresponding bit in the (32-bit) control field *must* be on, and a
- * bit in the high half is on if the corresponding bit in the control field
- * may be on. See also vmx_control_verify().
- */
-void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
{
- struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
-
- /*
- * Note that as a general rule, the high half of the MSRs (bits in
- * the control fields which may be 1) should be initialized by the
- * intersection of the underlying hardware's MSR (i.e., features which
- * can be supported) and the list of features we want to expose -
- * because they are known to be properly supported in our code.
- * Also, usually, the low half of the MSRs (bits which must be 1) can
- * be set to 0, meaning that L1 may turn off any of these bits. The
- * reason is that if one of these bits is necessary, it will appear
- * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
- * fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_l1_wants_exit() will not pass related exits to L1.
- * These rules have exceptions below.
- */
-
- /* pin-based controls */
msrs->pinbased_ctls_low =
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6726,8 +6782,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+}
- /* exit controls */
+static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6746,8 +6805,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+}
- /* entry controls */
+static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6763,8 +6825,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+}
- /* cpu-based controls */
+static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -6796,12 +6861,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
/* We support free control of CR3 access interception. */
msrs->procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
+}
- /*
- * secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by
- * vmx_vcpu_after_set_cpuid.
- */
+static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
+ struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
@@ -6814,9 +6879,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_TSC_SCALING;
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
/*
* We can emulate "VMCS shadowing," even if the hardware
@@ -6845,18 +6912,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_PML;
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
- }
- if (cpu_has_vmx_vmfunc()) {
- msrs->secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_VMFUNC;
/*
- * Advertise EPTP switching unconditionally
- * since we emulate it
+ * Advertise EPTP switching irrespective of hardware support,
+ * KVM emulates it in software so long as VMFUNC is supported.
*/
- if (enable_ept)
- msrs->vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (cpu_has_vmx_vmfunc())
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -6882,8 +6944,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (enable_sgx)
msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
+}
- /* miscellaneous data */
+static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
+ struct nested_vmx_msrs *msrs)
+{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
@@ -6891,7 +6956,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
msrs->misc_high = 0;
+}
+static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
+{
/*
* This MSR reports some information about VMX support. We
* should return information about the VMX we emulate for the
@@ -6906,7 +6974,10 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
+}
+static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
+{
/*
* These MSRs specify bits which the guest must keep fixed on
* while L1 is in VMXON mode (in L1's root mode, or running an L2).
@@ -6923,6 +6994,51 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
if (vmx_umip_emulated())
msrs->cr4_fixed1 |= X86_CR4_UMIP;
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ */
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
+{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
+ /*
+ * Note that as a general rule, the high half of the MSRs (bits in
+ * the control fields which may be 1) should be initialized by the
+ * intersection of the underlying hardware's MSR (i.e., features which
+ * can be supported) and the list of features we want to expose -
+ * because they are known to be properly supported in our code.
+ * Also, usually, the low half of the MSRs (bits which must be 1) can
+ * be set to 0, meaning that L1 may turn off any of these bits. The
+ * reason is that if one of these bits is necessary, it will appear
+ * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+ * fields of vmcs01 and vmcs02, will turn these bits off - and
+ * nested_vmx_l1_wants_exit() will not pass related exits to L1.
+ * These rules have exceptions below.
+ */
+ nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_exit_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_entry_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);
+
+ nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);
+
+ nested_vmx_setup_misc_data(vmcs_conf, msrs);
+
+ nested_vmx_setup_basic(msrs);
+
+ nested_vmx_setup_cr_fixed(msrs);
msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}
@@ -6988,4 +7104,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
.write_log_dirty = nested_vmx_write_pml_buffer,
.enable_evmcs = nested_enable_evmcs,
.get_evmcs_version = nested_get_evmcs_version,
+ .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6312c9541c3c..96952263b029 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -79,9 +79,10 @@ static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
}
/*
- * Return the cr0 value that a nested guest would read. This is a combination
- * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
- * its hypervisor (cr0_read_shadow).
+ * Return the cr0/4 value that a nested guest would read. This is a combination
+ * of L1's "real" cr0 used to run the guest (guest_cr0), and the bits shadowed
+ * by the L1 hypervisor (cr0_read_shadow). KVM must emulate CPU behavior as
+ * the value+mask loaded into vmcs02 may not match the vmcs12 fields.
*/
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10b33da9bd05..741efe2c497b 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -8,6 +8,8 @@
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -20,16 +22,19 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+static struct {
+ u8 eventsel;
+ u8 unit_mask;
+} const intel_arch_events[] = {
+ [0] = { 0x3c, 0x00 },
+ [1] = { 0xc0, 0x00 },
+ [2] = { 0x3c, 0x01 },
+ [3] = { 0x2e, 0x4f },
+ [4] = { 0x2e, 0x41 },
+ [5] = { 0xc4, 0x00 },
+ [6] = { 0xc5, 0x00 },
/* The above index must match CPUID 0x0A.EBX bit vector */
- [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03 },
};
/* mapping between fixed pmc index and intel_arch_events array */
@@ -52,7 +57,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
__set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
- reprogram_counter(pmc);
+ kvm_pmu_request_counter_reprogram(pmc);
}
}
@@ -71,13 +76,13 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
static void reprogram_counters(struct kvm_pmu *pmu, u64 diff)
{
int bit;
- struct kvm_pmc *pmc;
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
- if (pmc)
- reprogram_counter(pmc);
- }
+ if (!diff)
+ return;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ set_bit(bit, pmu->reprogram_pmi);
+ kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu));
}
static bool intel_hw_event_available(struct kvm_pmc *pmc)
@@ -346,45 +351,47 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
msr_info->data = pmu->fixed_ctr_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_STATUS:
msr_info->data = pmu->global_status;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
msr_info->data = pmu->global_ctrl;
- return 0;
+ break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
msr_info->data = 0;
- return 0;
+ break;
case MSR_IA32_PEBS_ENABLE:
msr_info->data = pmu->pebs_enable;
- return 0;
+ break;
case MSR_IA32_DS_AREA:
msr_info->data = pmu->ds_area;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
msr_info->data = pmu->pebs_data_cfg;
- return 0;
+ break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
(pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_GP];
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
u64 val = pmc_read_counter(pmc);
msr_info->data =
val & pmu->counter_bitmask[KVM_PMC_FIXED];
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
msr_info->data = pmc->eventsel;
- return 0;
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, true)) {
+ break;
+ }
+ return 1;
}
- return 1;
+ return 0;
}
static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -397,44 +404,43 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & pmu->fixed_ctr_ctrl_mask)) {
+ if (data & pmu->fixed_ctr_ctrl_mask)
+ return 1;
+
+ if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
- return 0;
- }
break;
case MSR_CORE_PERF_GLOBAL_STATUS:
- if (msr_info->host_initiated) {
- pmu->global_status = data;
- return 0;
- }
- break; /* RO MSR */
+ if (!msr_info->host_initiated)
+ return 1; /* RO MSR */
+
+ pmu->global_status = data;
+ break;
case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (kvm_valid_perf_global_ctrl(pmu, data)) {
+ if (!kvm_valid_perf_global_ctrl(pmu, data))
+ return 1;
+
+ if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & pmu->global_ovf_ctrl_mask)) {
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- return 0;
- }
+ if (data & pmu->global_ovf_ctrl_mask)
+ return 1;
+
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
break;
case MSR_IA32_PEBS_ENABLE:
- if (pmu->pebs_enable == data)
- return 0;
- if (!(data & pmu->pebs_enable_mask)) {
+ if (data & pmu->pebs_enable_mask)
+ return 1;
+
+ if (pmu->pebs_enable != data) {
diff = pmu->pebs_enable ^ data;
pmu->pebs_enable = data;
reprogram_counters(pmu, diff);
- return 0;
}
break;
case MSR_IA32_DS_AREA:
@@ -442,15 +448,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
+
pmu->ds_area = data;
- return 0;
+ break;
case MSR_PEBS_DATA_CFG:
- if (pmu->pebs_data_cfg == data)
- return 0;
- if (!(data & pmu->pebs_data_cfg_mask)) {
- pmu->pebs_data_cfg = data;
- return 0;
- }
+ if (data & pmu->pebs_data_cfg_mask)
+ return 1;
+
+ pmu->pebs_data_cfg = data;
break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -458,33 +463,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
(data & ~pmu->counter_bitmask[KVM_PMC_GP]))
return 1;
+
if (!msr_info->host_initiated &&
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
pmc_update_sample_period(pmc);
- return 0;
+ break;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
reserved_bits = pmu->reserved_bits;
if ((pmc->idx == 2) &&
(pmu->raw_event_mask & HSW_IN_TX_CHECKPOINTED))
reserved_bits ^= HSW_IN_TX_CHECKPOINTED;
- if (!(data & reserved_bits)) {
+ if (data & reserved_bits)
+ return 1;
+
+ if (data != pmc->eventsel) {
pmc->eventsel = data;
- reprogram_counter(pmc);
- return 0;
+ kvm_pmu_request_counter_reprogram(pmc);
}
- } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false))
- return 0;
+ break;
+ } else if (intel_pmu_handle_lbr_msrs_access(vcpu, msr_info, false)) {
+ break;
+ }
+ /* Not a known PMU MSR. */
+ return 1;
}
- return 1;
+ return 0;
}
static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
@@ -526,6 +536,16 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->pebs_enable_mask = ~0ull;
pmu->pebs_data_cfg_mask = ~0ull;
+ memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
+
+ /*
+ * Setting passthrough of LBR MSRs is done only in the VM-Entry loop,
+ * and PMU refresh is disallowed after the vCPU has run, i.e. this code
+ * should never be reached while KVM is passing through MSRs.
+ */
+ if (KVM_BUG_ON(lbr_desc->msr_passthrough, vcpu->kvm))
+ return;
+
entry = kvm_find_cpuid_entry(vcpu, 0xa);
if (!entry || !vcpu->kvm->arch.enable_pmu)
return;
@@ -631,7 +651,6 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
pmu->fixed_counters[i].current_config = 0;
}
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
lbr_desc->records.nr = 0;
lbr_desc->event = NULL;
lbr_desc->msr_passthrough = false;
@@ -647,14 +666,14 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc = &pmu->gp_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
+ pmc->counter = pmc->prev_counter = pmc->eventsel = 0;
}
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmc = &pmu->fixed_counters[i];
pmc_stop_counter(pmc);
- pmc->counter = 0;
+ pmc->counter = pmc->prev_counter = 0;
}
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
@@ -763,8 +782,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
return;
warn:
- pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
- vcpu->vcpu_id);
+ pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id);
}
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
@@ -811,4 +829,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.reset = intel_pmu_reset,
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
+ .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..94c38bea60e7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 8f95c7c01433..2261b684a7d4 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/sgx.h>
@@ -28,14 +29,14 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
/* Skip vmcs.GUEST_DS retrieval for 64-bit mode to avoid VMREADs. */
*gva = offset;
- if (!is_long_mode(vcpu)) {
+ if (!is_64_bit_mode(vcpu)) {
vmx_get_segment(vcpu, &s, VCPU_SREG_DS);
*gva += s.base;
}
if (!IS_ALIGNED(*gva, alignment)) {
fault = true;
- } else if (likely(is_long_mode(vcpu))) {
+ } else if (likely(is_64_bit_mode(vcpu))) {
fault = is_noncanonical_address(*gva, vcpu);
} else {
*gva &= 0xffffffff;
@@ -164,17 +165,24 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
(attributes & SGX_ATTR_PROVISIONKEY)) {
if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
- pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
kvm_inject_gp(vcpu, 0);
return 1;
}
- /* Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. */
+ /*
+ * Enforce CPUID restrictions on MISCSELECT, ATTRIBUTES and XFRM. Note
+ * that the allowed XFRM (XFeature Request Mask) isn't strictly bound
+ * by the supported XCR0. FP+SSE *must* be set in XFRM, even if XSAVE
+ * is unsupported, i.e. even if XCR0 itself is completely unsupported.
+ */
if ((u32)miscselect & ~sgx_12_0->ebx ||
(u32)attributes & ~sgx_12_1->eax ||
(u32)(attributes >> 32) & ~sgx_12_1->ebx ||
(u32)xfrm & ~sgx_12_1->ecx ||
- (u32)(xfrm >> 32) & ~sgx_12_1->edx) {
+ (u32)(xfrm >> 32) & ~sgx_12_1->edx ||
+ xfrm & ~(vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE) ||
+ (xfrm & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -182,8 +190,10 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
/* Enforce CPUID restriction on max enclave size. */
max_size_log2 = (attributes & SGX_ATTR_MODE64BIT) ? sgx_12_0->edx >> 8 :
sgx_12_0->edx;
- if (size >= BIT_ULL(max_size_log2))
+ if (size >= BIT_ULL(max_size_log2)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
/*
* sgx_virt_ecreate() returns:
@@ -379,7 +389,7 @@ int handle_encls(struct kvm_vcpu *vcpu)
return handle_encls_ecreate(vcpu);
if (leaf == EINIT)
return handle_encls_einit(vcpu);
- WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index ac290a44a693..7c1996b433e2 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -75,7 +75,7 @@ struct loaded_vmcs {
struct vmcs_controls_shadow controls_shadow;
};
-static inline bool is_intr_type(u32 intr_info, u32 type)
+static __always_inline bool is_intr_type(u32 intr_info, u32 type)
{
const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
@@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info)
return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}
-static inline bool is_nmi(u32 intr_info)
+static __always_inline bool is_nmi(u32 intr_info)
{
return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 2251b60920f8..106a72c923ca 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 746129ddd5ae..01936013428b 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -208,9 +208,8 @@ struct __packed vmcs12 {
/*
* For save/restore compatibility, the vmcs12 field offsets must not change.
*/
-#define CHECK_OFFSET(field, loc) \
- BUILD_BUG_ON_MSG(offsetof(struct vmcs12, field) != (loc), \
- "Offset of " #field " in struct vmcs12 has changed.")
+#define CHECK_OFFSET(field, loc) \
+ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
static inline void vmx_check_vmcs12_offsets(void)
{
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 0b5db4de4d09..631fd7da2bc3 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,6 +31,39 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
+.macro VMX_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ RET
+.endm
+
.section .noinstr.text, "ax"
/**
@@ -69,8 +102,8 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2
- /* Copy @flags to BL, _ASM_ARG3 is volatile. */
- mov %_ASM_ARG3B, %bl
+ /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3L, %ebx
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -106,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- testb $VMX_RUN_VMRESUME, %bl
+ test $VMX_RUN_VMRESUME, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -128,7 +161,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Check EFLAGS.ZF from 'testb' above */
+ /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
jz .Lvmlaunch
/*
@@ -229,7 +262,7 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
* eIBRS has its own protection against poisoned RSB, so it doesn't
* need the RSB filling sequence. But it does need to be enabled, and a
* single call to retire, before the first unbalanced RET.
- */
+ */
FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
X86_FEATURE_RSB_VMEXIT_LITE
@@ -266,14 +299,19 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
SYM_FUNC_END(__vmx_vcpu_run)
+SYM_FUNC_START(vmx_do_nmi_irqoff)
+ VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(vmx_do_nmi_irqoff)
+
.section .text, "ax"
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
* @field: VMCS field encoding that failed
* @fault: %true if the VMREAD faulted, %false if it failed
-
+ *
* Save and restore volatile registers across a call to vmread_error(). Note,
* all parameters are passed on the stack.
*/
@@ -317,36 +355,8 @@ SYM_FUNC_START(vmread_error_trampoline)
RET
SYM_FUNC_END(vmread_error_trampoline)
-
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
#endif
- pushf
- push $__KERNEL_CS
- CALL_NOSPEC _ASM_ARG1
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
- RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
+ VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 63247c57c72c..44fb619803b8 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -12,6 +12,7 @@
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
@@ -51,7 +52,6 @@
#include "capabilities.h"
#include "cpuid.h"
-#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
@@ -66,6 +66,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "smm.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -163,6 +164,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
MSR_IA32_PRED_CMD,
+ MSR_IA32_FLUSH_CMD,
MSR_IA32_TSC,
#ifdef CONFIG_X86_64
MSR_FS_BASE,
@@ -444,36 +446,36 @@ void vmread_error(unsigned long field, bool fault)
if (fault)
kvm_spurious_fault();
else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
@@ -488,8 +490,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -523,10 +525,12 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
-static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
{
struct hv_enlightened_vmcs *evmcs;
struct hv_partition_assist_pg **p_hv_pa_pg =
@@ -551,6 +555,71 @@ static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&__kvm_is_using_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!kvm_is_using_evmcs())
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -806,7 +875,7 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
- else {
+ else {
int mask = 0, match = 0;
if (enable_ept && (eb & (1u << PF_VECTOR))) {
@@ -858,7 +927,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
* to change it directly without causing a vmexit. In that case read
* it after vmexit and store it in vmx->spec_ctrl.
*/
- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+ if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
flags |= VMX_RUN_SAVE_SPEC_CTRL;
return flags;
@@ -1214,7 +1283,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
}
}
- if (vmx->nested.need_vmcs12_to_shadow_sync)
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
if (vmx->guest_state_loaded)
@@ -1348,8 +1417,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
/*
* No indirect branch prediction barrier needed when switching
- * the active VMCS within a guest, e.g. on nested VM-Enter.
- * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ * the active VMCS within a vCPU, unless IBRS is advertised to
+ * the vCPU. To minimize the number of IBPBs executed, KVM
+ * performs IBPB on nested VM-Exit (a single nested transition
+ * may switch the active VMCS multiple times).
*/
if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
indirect_branch_prediction_barrier();
@@ -1611,8 +1682,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
@@ -1834,24 +1905,51 @@ bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
-static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
- uint64_t val)
+/*
+ * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
+ * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain
+ * backwards compatibility even though KVM doesn't support emulating SMX. And
+ * because userspace set "VMX in SMX", the guest must also be allowed to set it,
+ * e.g. if the MSR is left unlocked and the guest does a RMW operation.
+ */
+#define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
+ FEAT_CTL_SGX_LC_ENABLED | \
+ FEAT_CTL_SGX_ENABLED | \
+ FEAT_CTL_LMCE_ENABLED)
+
+static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
+ struct msr_data *msr)
{
- uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+ uint64_t valid_bits;
- return !(val & ~valid_bits);
+ /*
+ * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
+ * exposed to the guest.
+ */
+ WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
+ ~KVM_SUPPORTED_FEATURE_CONTROL);
+
+ if (!msr->host_initiated &&
+ (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
+ return false;
+
+ if (msr->host_initiated)
+ valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
+ else
+ valid_bits = vmx->msr_ia32_feature_control_valid_bits;
+
+ return !(msr->data & ~valid_bits);
}
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
- case MSR_IA32_PERF_CAPABILITIES:
- msr->data = vmx_get_perf_capabilities();
- return 0;
default:
return KVM_MSR_RET_INVALID;
}
@@ -1933,7 +2031,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested_vmx_allowed(vcpu))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
@@ -2029,7 +2127,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
(host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- if ((vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT) &&
+ if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
(host_initiated || intel_pmu_lbr_is_enabled(vcpu)))
debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
@@ -2109,9 +2207,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
}
@@ -2190,33 +2286,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
return 1;
goto find_uret_msr;
- case MSR_IA32_PRED_CMD:
- if (!msr_info->host_initiated &&
- !guest_has_pred_cmd_msr(vcpu))
- return 1;
-
- if (data & ~PRED_CMD_IBPB)
- return 1;
- if (!boot_cpu_has(X86_FEATURE_IBPB))
- return 1;
- if (!data)
- break;
-
- wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
-
- /*
- * For non-nested:
- * When it's written (to non-zero) for the first time, pass
- * it through.
- *
- * For nested:
- * The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_prepare_msr_bitmap. We should not touch the
- * vmcs02.msr_bitmap here since it gets completely overwritten
- * in the merging.
- */
- vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
- break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
return 1;
@@ -2241,10 +2310,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEAT_CTL:
- if (!vmx_feature_control_msr_valid(vcpu, data) ||
- (to_vmx(vcpu)->msr_ia32_feature_control &
- FEAT_CTL_LOCKED && !msr_info->host_initiated))
+ if (!is_vmx_feature_control_msr_valid(vmx, msr_info))
return 1;
+
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
@@ -2272,7 +2340,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->msr_ia32_sgxlepubkeyhash
[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
break;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
if (!nested_vmx_allowed(vcpu))
@@ -2342,14 +2410,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
- (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT))
+ (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
}
if (data & PERF_CAP_PEBS_FORMAT) {
if ((data & PERF_CAP_PEBS_MASK) !=
- (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK))
+ (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
return 1;
@@ -2420,88 +2488,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int vmx_hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void vmx_hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
@@ -2537,8 +2523,7 @@ static bool cpu_has_perf_global_ctrl_bug(void)
return false;
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2556,7 +2541,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
@@ -2565,8 +2550,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
return ctl_opt & allowed;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
@@ -2724,9 +2709,126 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmentry_ctrl = _vmentry_control;
vmcs_conf->misc = misc_msr;
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
+
+ return 0;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_has_vmx()) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
return 0;
}
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
@@ -2740,7 +2842,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
memset(vmcs, 0, vmcs_config.size);
/* KVM supports Enlightened VMCS v1 only */
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
vmcs->hdr.revision_id = vmcs_config.revision_id;
@@ -2835,7 +2937,7 @@ static __init int alloc_kvm_area(void)
* still be marked with revision_id reported by
* physical CPU.
*/
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
@@ -2922,9 +3024,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2954,8 +3055,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
+ pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
vmx_segment_cache_clear(vmx);
@@ -3412,18 +3512,15 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable || !var->present)
- ar = 1 << 16;
- else {
- ar = var->type & 15;
- ar |= (var->s & 1) << 4;
- ar |= (var->dpl & 3) << 5;
- ar |= (var->present & 1) << 7;
- ar |= (var->avl & 1) << 12;
- ar |= (var->l & 1) << 13;
- ar |= (var->db & 1) << 14;
- ar |= (var->g & 1) << 15;
- }
+ ar = var->type & 15;
+ ar |= (var->s & 1) << 4;
+ ar |= (var->dpl & 3) << 5;
+ ar |= (var->present & 1) << 7;
+ ar |= (var->avl & 1) << 12;
+ ar |= (var->l & 1) << 13;
+ ar |= (var->db & 1) << 14;
+ ar |= (var->g & 1) << 15;
+ ar |= (var->unusable || !var->present) << 16;
return ar;
}
@@ -3775,39 +3872,6 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, ar);
}
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct page *page;
- void __user *hva;
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
- hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (IS_ERR(hva)) {
- ret = PTR_ERR(hva);
- goto out;
- }
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
-}
-
int allocate_vpid(void)
{
int vpid;
@@ -3840,8 +3904,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (kvm_is_using_evmcs()) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
@@ -3922,29 +3991,20 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
-{
- unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
- unsigned long read_intercept;
- int msr;
-
- read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned int read_idx = msr / BITS_PER_LONG;
- unsigned int write_idx = read_idx + (0x800 / sizeof(long));
-
- msr_bitmap[read_idx] = read_intercept;
- msr_bitmap[write_idx] = ~0ul;
- }
-}
-
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
u8 mode;
- if (!cpu_has_vmx_msr_bitmap())
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
return;
if (cpu_has_secondary_exec_ctrls() &&
@@ -3962,7 +4022,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
vmx->x2apic_msr_bitmap_mode = mode;
- vmx_reset_x2apic_msrs(vcpu, mode);
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
/*
* TPR reads and writes can be virtualized even if virtual interrupt
@@ -4431,6 +4502,13 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* controls for features that are/aren't exposed to the guest.
*/
if (nested) {
+ /*
+ * All features that can be added or removed to VMX MSRs must
+ * be supported in the first place for nested virtualization.
+ */
+ if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
+ enabled = false;
+
if (enabled)
vmx->nested.msrs.secondary_ctls_high |= control;
else
@@ -4487,6 +4565,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -4503,7 +4587,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
* it needs to be set here when dirty logging is already active, e.g.
* if this vCPU was created after dirty logging was enabled.
*/
- if (!vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
if (cpu_has_vmx_xsaves()) {
@@ -4662,7 +4746,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
@@ -4938,10 +5022,10 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
- /*
- * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
- * e.g. if the IRQ arrived asynchronously after checking nested events.
- */
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return -EBUSY;
@@ -5052,7 +5136,7 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
return true;
- return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
@@ -5067,8 +5151,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vect_info = vmx->idt_vectoring_info;
intr_info = vmx_get_intr_info(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
if (is_machine_check(intr_info) || is_nmi(intr_info))
- return 1; /* handled by handle_exception_nmi_irqoff() */
+ return 1;
/*
* Queue the exception here instead of in handle_nm_fault_irqoff().
@@ -5384,7 +5473,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
case 3: /* lmsw */
val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
- trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+ trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
kvm_lmsw(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -6758,17 +6847,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
-{
- bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
- kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
- vmx_do_interrupt_nmi_irqoff(entry);
- kvm_after_interrupt(vcpu);
-}
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
@@ -6790,9 +6870,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6804,9 +6883,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
- /* We need to handle NMIs before interrupts are enabled */
- else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
@@ -6816,10 +6892,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ vmx_do_interrupt_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+
vcpu->arch.at_instruction_boundary = true;
}
@@ -6833,7 +6912,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_nmi_irqoff(vmx);
+ handle_exception_irqoff(vmx);
}
/*
@@ -6844,12 +6923,14 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ return false;
/*
* We cannot do SMM unless we can run the guest in big
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
- case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
case MSR_AMD64_TSC_RATIO:
@@ -7066,9 +7147,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_vmx *vmx,
- unsigned long flags)
+ unsigned int flags)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
@@ -7092,6 +7174,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
+ if (unlikely(vmx->fail))
+ vmx->exit_reason.full = 0xdead;
+ else
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+
+ if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+ is_nmi(vmx_get_intr_info(vcpu))) {
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+ }
+
guest_state_exit_irqoff();
}
@@ -7186,10 +7280,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_wait_lapic_expire(vcpu);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
- if (static_branch_unlikely(&enable_evmcs)) {
+ if (kvm_is_using_evmcs()) {
current_evmcs->hv_clean_fields |=
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
@@ -7233,12 +7327,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->idt_vectoring_info = 0;
- if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- }
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
@@ -7322,7 +7413,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
* feature only for vmcs01, KVM currently isn't equipped to realize any
* performance benefits from enabling it for vmcs02.
*/
- if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ if (kvm_is_using_evmcs() &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
@@ -7352,7 +7443,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
- err = alloc_apic_access_page(vcpu->kvm);
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
@@ -7412,29 +7503,6 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static int __init vmx_check_processor_compat(void)
-{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
-}
-
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -7463,7 +7531,7 @@ static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
@@ -7649,6 +7717,13 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
!guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ if (boot_cpu_has(X86_FEATURE_IBPB))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
+ !guest_has_pred_cmd_msr(vcpu));
+
+ if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
@@ -7669,6 +7744,33 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
+static u64 vmx_get_perf_capabilities(void)
+{
+ u64 perf_cap = PMU_CAP_FW_WRITES;
+ struct x86_pmu_lbr lbr;
+ u64 host_perf_cap = 0;
+
+ if (!enable_pmu)
+ return 0;
+
+ if (boot_cpu_has(X86_FEATURE_PDCM))
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+
+ if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
+ x86_perf_get_lbr(&lbr);
+ if (lbr.nr)
+ perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
+ }
+
+ if (vmx_pebs_supported()) {
+ perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
+ if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+ }
+
+ return perf_cap;
+}
+
static __init void vmx_set_cpu_caps(void)
{
kvm_set_cpu_caps();
@@ -7691,6 +7793,7 @@ static __init void vmx_set_cpu_caps(void)
if (!enable_pmu)
kvm_cpu_cap_clear(X86_FEATURE_PDCM);
+ kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
if (!enable_sgx) {
kvm_cpu_cap_clear(X86_FEATURE_SGX);
@@ -7797,6 +7900,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
break;
+ case x86_intercept_pause:
+ /*
+ * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+ * with vanilla NOPs in the emulator. Apply the interception
+ * check only to actual PAUSE instructions. Don't check
+ * PAUSE-loop-exiting, software can't expect a given PAUSE to
+ * exit, i.e. KVM is within its rights to allow L2 to execute
+ * the PAUSE.
+ */
+ if ((info->rep_prefix != REPE_PREFIX) ||
+ !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ return X86EMUL_CONTINUE;
+
+ break;
+
/* TODO: check more intercepts... */
default:
break;
@@ -7880,17 +7998,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
return;
}
/*
- * Note, cpu_dirty_logging_count can be changed concurrent with this
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
* code, but in that case another update request will be made and so
* the guest will never run with a stale PML value.
*/
- if (vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
@@ -7906,6 +8027,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEAT_CTL_LMCE_ENABLED;
}
+#ifdef CONFIG_KVM_SMM
static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
@@ -7914,7 +8036,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7935,7 +8057,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
+static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -7960,6 +8082,7 @@ static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
+#endif
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
@@ -7986,17 +8109,16 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
+#define VMX_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
+)
static void vmx_vm_destroy(struct kvm *kvm)
{
@@ -8006,7 +8128,9 @@ static void vmx_vm_destroy(struct kvm *kvm)
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
.hardware_unsetup = vmx_hardware_unsetup,
@@ -8080,7 +8204,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@ -8127,10 +8251,12 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.setup_mce = vmx_setup_mce,
+#ifdef CONFIG_KVM_SMM
.smi_allowed = vmx_smi_allowed,
.enter_smm = vmx_enter_smm,
.leave_smm = vmx_leave_smm,
.enable_smi_window = vmx_enable_smi_window,
+#endif
.can_emulate_instruction = vmx_can_emulate_instruction,
.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
@@ -8224,7 +8350,7 @@ static __init int hardware_setup(void)
return -EIO;
if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
if (boot_cpu_has(X86_FEATURE_NX))
@@ -8232,7 +8358,7 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
@@ -8251,7 +8377,7 @@ static __init int hardware_setup(void)
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
@@ -8286,9 +8412,8 @@ static __init int hardware_setup(void)
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
- vmx_x86_ops.tlb_remote_flush_with_range =
- hv_remote_flush_tlb_with_range;
+ vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8403,9 +8528,6 @@ static __init int hardware_setup(void)
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.handle_intel_pt_intr = NULL,
@@ -8423,41 +8545,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
+ kvm_x86_vendor_exit();
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8465,56 +8569,29 @@ static int __init vmx_init(void)
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
-
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
-
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_direct_tlbflush
- = hv_enable_direct_tlbflush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
+ hv_init_evmcs();
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vmx_init_ops);
if (r)
return r;
/*
- * Must be called after kvm_init() so enable_ept is properly set
+ * Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;
vmx_setup_fb_clear_ctrl();
@@ -8538,6 +8615,21 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..9e66531861cf 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -369,7 +369,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 16
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -640,12 +640,30 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
(1 << VCPU_EXREG_EXIT_INFO_1) | \
(1 << VCPU_EXREG_EXIT_INFO_2))
-static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+static inline unsigned long vmx_l1_guest_owned_cr0_bits(void)
+{
+ unsigned long bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+
+ /*
+ * CR0.WP needs to be intercepted when KVM is shadowing legacy paging
+ * in order to construct shadow PTEs with the correct protections.
+ * Note! CR0.WP technically can be passed through to the guest if
+ * paging is disabled, but checking CR0.PG would generate a cyclical
+ * dependency of sorts due to forcing the caller to ensure CR0 holds
+ * the correct value prior to determining which CR0 bits can be owned
+ * by L1. Keep it simple and limit the optimization to EPT.
+ */
+ if (!enable_ept)
+ bits &= ~X86_CR0_WP;
+ return bits;
+}
+
+static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
}
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
@@ -669,25 +687,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
-static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- }
+
return vmx->exit_qualification;
}
-static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- }
+
return vmx->exit_intr_info;
}
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index ec268df83ed6..ce47dc265f89 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -6,19 +6,33 @@
#include <asm/vmx.h>
-#include "evmcs.h"
+#include "hyperv.h"
#include "vmcs.h"
#include "../x86.h"
void vmread_error(unsigned long field, bool fault);
-__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
- bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+/*
+ * The VMREAD error trampoline _always_ uses the stack to pass parameters, even
+ * for 64-bit targets. Preserving all registers allows the VMREAD inline asm
+ * blob to avoid clobbering GPRs, which in turn allows the compiler to better
+ * optimize sequences of VMREADs.
+ *
+ * Declare the trampoline as an opaque label as it's not safe to call from C
+ * code; there is no way to tell the compiler to pass params on the stack for
+ * 64-bit targets.
+ *
+ * void vmread_error_trampoline(unsigned long field, bool fault);
+ */
+extern unsigned long vmread_error_trampoline;
+#endif
+
static __always_inline void vmcs_check16(unsigned long field)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
@@ -86,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
return value;
do_fail:
- WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
- pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ instrumentation_begin();
+ WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ instrumentation_end();
return 0;
do_exception:
@@ -131,7 +147,7 @@ do_exception:
static __always_inline u16 vmcs_read16(unsigned long field)
{
vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read16(field);
return __vmcs_readl(field);
}
@@ -139,7 +155,7 @@ static __always_inline u16 vmcs_read16(unsigned long field)
static __always_inline u32 vmcs_read32(unsigned long field)
{
vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read32(field);
return __vmcs_readl(field);
}
@@ -147,7 +163,7 @@ static __always_inline u32 vmcs_read32(unsigned long field)
static __always_inline u64 vmcs_read64(unsigned long field)
{
vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read64(field);
#ifdef CONFIG_X86_64
return __vmcs_readl(field);
@@ -159,7 +175,7 @@ static __always_inline u64 vmcs_read64(unsigned long field)
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_read64(field);
return __vmcs_readl(field);
}
@@ -206,7 +222,7 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val
static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
vmcs_check16(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write16(field, value);
__vmcs_writel(field, value);
@@ -215,7 +231,7 @@ static __always_inline void vmcs_write16(unsigned long field, u16 value)
static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
vmcs_check32(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, value);
__vmcs_writel(field, value);
@@ -224,7 +240,7 @@ static __always_inline void vmcs_write32(unsigned long field, u32 value)
static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
vmcs_check64(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write64(field, value);
__vmcs_writel(field, value);
@@ -236,7 +252,7 @@ static __always_inline void vmcs_write64(unsigned long field, u64 value)
static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
vmcs_checkl(field);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write64(field, value);
__vmcs_writel(field, value);
@@ -246,7 +262,7 @@ static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_clear_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, evmcs_read32(field) & ~mask);
__vmcs_writel(field, __vmcs_readl(field) & ~mask);
@@ -256,7 +272,7 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
{
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
"vmcs_set_bits does not support 64-bit fields");
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_write32(field, evmcs_read32(field) | mask);
__vmcs_writel(field, __vmcs_readl(field) | mask);
@@ -273,7 +289,7 @@ static inline void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
- if (static_branch_unlikely(&enable_evmcs))
+ if (kvm_is_using_evmcs())
return evmcs_load(phys_addr);
vmx_asm1(vmptrld, "m"(phys_addr), vmcs, phys_addr);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ecea83f0da49..04b57a336b34 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,6 +15,7 @@
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
@@ -30,6 +31,7 @@
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"
+#include "smm.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -58,7 +60,9 @@
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
#include <linux/suspend.h>
+#include <linux/smp.h>
+#include <trace/events/ipi.h>
#include <trace/events/kvm.h>
#include <asm/debugreg.h>
@@ -119,8 +123,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
-static void process_smi(struct kvm_vcpu *vcpu);
-static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
@@ -129,6 +131,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -192,6 +195,10 @@ module_param(enable_pmu, bool, 0444);
bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);
+/* Enable/disable SMT_RSB bug mitigation */
+static bool __read_mostly mitigate_smt_rsb;
+module_param(mitigate_smt_rsb, bool, 0444);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -464,7 +471,6 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
return vcpu->arch.apic_base;
}
-EXPORT_SYMBOL_GPL(kvm_get_apic_base);
enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
@@ -492,7 +498,6 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_apic_base);
/*
* Handle a fault on a hardware virtualization (VMX or SVM) instruction.
@@ -628,6 +633,12 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -777,7 +788,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
fault->address);
}
-EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
@@ -794,8 +804,8 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
*/
if ((fault->error_code & PFERR_PRESENT_MASK) &&
!(fault->error_code & PFERR_RSVD_MASK))
- kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
- fault_mmu->root.hpa);
+ kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+ KVM_MMU_ROOT_CURRENT);
fault_mmu->inject_page_fault(vcpu, fault);
}
@@ -806,7 +816,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
atomic_inc(&vcpu->arch.nmi_queued);
kvm_make_request(KVM_REQ_NMI, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_inject_nmi);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
@@ -831,11 +840,10 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
}
-EXPORT_SYMBOL_GPL(kvm_require_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
- if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
return true;
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -900,6 +908,24 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
+ /*
+ * CR0.WP is incorporated into the MMU role, but only for non-nested,
+ * indirect shadow MMUs. If paging is disabled, no updates are needed
+ * as there are no permission bits to emulate. If TDP is enabled, the
+ * MMU's metadata needs to be updated, e.g. so that emulating guest
+ * translations does the right thing, but there's no need to unload the
+ * root as CR0.WP doesn't affect SPTEs.
+ */
+ if ((cr0 ^ old_cr0) == X86_CR0_WP) {
+ if (!(cr0 & X86_CR0_PG))
+ return;
+
+ if (tdp_enabled) {
+ kvm_init_mmu(vcpu);
+ return;
+ }
+ }
+
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
@@ -959,7 +985,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
return 1;
if (!(cr0 & X86_CR0_PG) &&
- (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
+ (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@@ -981,7 +1007,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
@@ -995,7 +1021,7 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
if (static_cpu_has(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
@@ -1009,14 +1035,14 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
+ kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -1172,9 +1198,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
- return 1;
-
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
return 1;
@@ -1221,7 +1244,7 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
* PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
* with PCIDE=0.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
return;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
@@ -1236,9 +1259,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
bool skip_tlb_flush = false;
unsigned long pcid = 0;
#ifdef CONFIG_X86_64
- bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- if (pcid_enabled) {
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
pcid = cr3 & X86_CR3_PCID_MASK;
@@ -1417,7 +1438,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
* may depend on host virtualization features rather than host cpu features.
*/
-static const u32 msrs_to_save_all[] = {
+static const u32 msrs_to_save_base[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1425,7 +1446,7 @@ static const u32 msrs_to_save_all[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@ -1434,6 +1455,10 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
MSR_IA32_UMWAIT_CONTROL,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
@@ -1458,11 +1483,10 @@ static const u32 msrs_to_save_all[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;
static const u32 emulated_msrs_all[] = {
@@ -1480,7 +1504,7 @@ static const u32 emulated_msrs_all[] = {
HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
HV_X64_MSR_SYNDBG_OPTIONS,
HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
@@ -1534,39 +1558,41 @@ static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
- * List of msr numbers which are used to expose MSR-based features that
- * can be used by a hypervisor to validate requested CPU features.
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
*/
-static const u32 msr_based_features_all[] = {
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR0_FIXED1,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_CR4_FIXED1,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
- MSR_F10H_DECFG,
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
};
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
static unsigned int num_msr_based_features;
/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
+{
+ int i;
+
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
+ return true;
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -1648,6 +1674,9 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
case MSR_IA32_ARCH_CAPABILITIES:
msr->data = kvm_get_arch_capabilities();
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = kvm_caps.supported_perf_cap;
+ break;
case MSR_IA32_UCODE_REV:
rdmsrl_safe(msr->index, &msr->data);
break;
@@ -1682,6 +1711,9 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
+ return false;
+
if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
@@ -2061,7 +2093,6 @@ int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_as_nop);
int kvm_emulate_invd(struct kvm_vcpu *vcpu)
{
@@ -2084,7 +2115,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
!guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
- pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@ -2178,6 +2209,22 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
+ u64 val;
+
+ /*
+ * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does
+ * not support modifying the guest vCPU model on the fly, e.g. changing
+ * the nVMX capabilities while L2 is running is nonsensical. Ignore
+ * writes of the same value, e.g. to allow userspace to blindly stuff
+ * all MSRs when emulating RESET.
+ */
+ if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
+ if (do_get_msr(vcpu, index, &val) || *data != val)
+ return -EINVAL;
+
+ return 0;
+ }
+
return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
@@ -2309,13 +2356,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
- if (system_time & 1) {
- kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
- KVM_HOST_USES_PFN, system_time & ~1ULL,
+ if (system_time & 1)
+ kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
sizeof(struct pvclock_vcpu_time_info));
- } else {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
- }
+ else
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
return;
}
@@ -2433,7 +2478,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@ -2507,7 +2553,6 @@ u64 kvm_scale_tsc(u64 tsc, u64 ratio)
return _tsc;
}
-EXPORT_SYMBOL_GPL(kvm_scale_tsc);
static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
@@ -2966,6 +3011,22 @@ static void kvm_update_masterclock(struct kvm *kvm)
kvm_end_pvclock_update(kvm);
}
+/*
+ * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+ * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz
+ * can change during boot even if the TSC is constant, as it's possible for KVM
+ * to be loaded before TSC calibration completes. Ideally, KVM would get a
+ * notification when calibration completes, but practically speaking calibration
+ * will complete before userspace is alive enough to create VMs.
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+ if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return tsc_khz;
+ else
+ return __this_cpu_read(cpu_tsc_khz);
+}
+
/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
@@ -2976,7 +3037,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
get_cpu();
data->flags = 0;
- if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+ if (ka->use_master_clock &&
+ (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
#ifdef CONFIG_X86_64
struct timespec64 ts;
@@ -2990,7 +3052,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
data->flags |= KVM_CLOCK_TSC_STABLE;
hv_clock.tsc_timestamp = ka->master_cycle_now;
hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+ kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
@@ -3029,12 +3091,10 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
unsigned long flags;
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock))) {
+ while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- offset + sizeof(*guest_hv_clock)))
+ if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
return;
read_lock_irqsave(&gpc->lock, flags);
@@ -3100,7 +3160,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ tgt_tsc_khz = get_cpu_tsc_khz();
if (unlikely(tgt_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -3144,6 +3204,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
+ kvm_xen_update_tsc_info(v);
}
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
@@ -3383,7 +3444,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time);
+ kvm_gpc_deactivate(&vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
@@ -3391,6 +3452,9 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
static_call(kvm_x86_flush_tlb_all)(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
@@ -3409,6 +3473,12 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
}
static_call(kvm_x86_flush_tlb_guest)(vcpu);
+
+ /*
+ * Flushing all "guest" TLB is always a superset of Hyper-V's fine
+ * grained flushing.
+ */
+ kvm_hv_vcpu_purge_flush_tlb(vcpu);
}
@@ -3532,9 +3602,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
+static bool kvm_is_msr_to_save(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -3560,20 +3641,46 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.arch_capabilities = data;
break;
- case MSR_IA32_PERF_CAPABILITIES: {
- struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
-
+ case MSR_IA32_PERF_CAPABILITIES:
if (!msr_info->host_initiated)
return 1;
- if (kvm_get_msr_feature(&msr_ent))
- return 1;
- if (data & ~msr_ent.data)
+ if (data & ~kvm_caps.supported_perf_cap)
return 1;
+ /*
+ * Note, this is not just a performance optimization! KVM
+ * disallows changing feature MSRs after the vCPU has run; PMU
+ * refresh will bug the VM if called after the vCPU has run.
+ */
+ if (vcpu->arch.perf_capabilities == data)
+ break;
+
vcpu->arch.perf_capabilities = data;
kvm_pmu_refresh(vcpu);
- return 0;
- }
+ break;
+ case MSR_IA32_PRED_CMD:
+ if (!msr_info->host_initiated && !guest_has_pred_cmd_msr(vcpu))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_IBPB) || (data & ~PRED_CMD_IBPB))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
+ break;
+ case MSR_IA32_FLUSH_CMD:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ return 1;
+
+ if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
+ return 1;
+ if (!data)
+ break;
+
+ wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -3585,15 +3692,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == BIT_ULL(18)) {
vcpu->arch.msr_hwcr = data;
} else if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
@@ -3645,7 +3750,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
}
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
@@ -3773,16 +3878,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true;
- fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (pr || data != 0)
- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_K7_CLK_CTL:
/*
@@ -3803,15 +3905,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
@@ -3859,20 +3960,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.guest_fpu.xfd_err = data;
break;
#endif
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
+
/*
* Userspace is allowed to write '0' to MSRs that KVM reports
* as to-be-saved, even if an MSRs isn't fully supported.
*/
- return !msr_info->host_initiated || data;
- default:
- if (kvm_pmu_is_valid_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr_info);
+ if (msr_info->host_initiated && !data &&
+ kvm_is_msr_to_save(msr))
+ break;
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -3962,20 +4061,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
- if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (!msr_info->host_initiated)
- return 1;
- msr_info->data = 0;
- break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4061,7 +4146,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.ia32_misc_enable_msr;
break;
case MSR_IA32_SMBASE:
- if (!msr_info->host_initiated)
+ if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
return 1;
msr_info->data = vcpu->arch.smbase;
break;
@@ -4173,6 +4258,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
@@ -4230,6 +4316,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
+
+ /*
+ * Userspace is allowed to read MSRs that KVM reports as
+ * to-be-saved, even if an MSR isn't fully supported.
+ */
+ if (msr_info->host_initiated &&
+ kvm_is_msr_to_save(msr_info->index)) {
+ msr_info->data = 0;
+ break;
+ }
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -4267,8 +4364,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
{
struct kvm_msrs msrs;
struct kvm_msr_entry *entries;
- int r, n;
unsigned size;
+ int r;
r = -EFAULT;
if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
@@ -4285,17 +4382,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
goto out;
}
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
- r = -EFAULT;
if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
+ r = -EFAULT;
-out_free:
kfree(entries);
out:
return r;
@@ -4383,6 +4474,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_PMU_EVENT_FILTER:
+ case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -4404,6 +4496,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VAPIC:
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
+ case KVM_CAP_IRQFD_RESAMPLE:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4419,7 +4512,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on())
- r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
+ r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
+ KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
break;
#endif
case KVM_CAP_SYNC_REGS:
@@ -4429,12 +4523,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
- KVM_X86_DISABLE_EXITS_CSTATE;
- if(kvm_can_mwait_in_guest())
- r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
break;
case KVM_CAP_X86_SMM:
+ if (!IS_ENABLED(CONFIG_KVM_SMM))
+ break;
+
/* SMBASE is usually relocated above 1M on modern chipsets,
* and SMM handlers might indeed rely on 4G segment limits,
* so do not report SMM to be available if real mode is
@@ -4475,7 +4577,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- r = kvm_x86_ops.enable_direct_tlbflush != NULL;
+ r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
@@ -4494,9 +4596,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 0;
break;
case KVM_CAP_XSAVE2: {
- u64 guest_perm = xstate_get_guest_group_perm();
-
- r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
+ r = xstate_required_size(kvm_get_filtered_xcr0(), false);
if (r < sizeof(struct kvm_xsave))
r = sizeof(struct kvm_xsave);
break;
@@ -4891,13 +4991,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
-static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
-{
- kvm_make_request(KVM_REQ_SMI, vcpu);
-
- return 0;
-}
-
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -5003,7 +5096,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
return 0;
if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
- !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
+ !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) {
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
@@ -5033,8 +5126,10 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
process_nmi(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
/*
* KVM's ABI only allows for one exception to be migrated. Luckily,
@@ -5062,16 +5157,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
ex->pending && ex->has_payload)
kvm_deliver_exception_payload(vcpu, ex);
+ memset(events, 0, sizeof(*events));
+
/*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
* again.
*/
- if (kvm_exception_is_soft(ex->vector)) {
- events->exception.injected = 0;
- events->exception.pending = 0;
- } else {
+ if (!kvm_exception_is_soft(ex->vector)) {
events->exception.injected = ex->injected;
events->exception.pending = ex->pending;
/*
@@ -5091,20 +5185,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
- events->interrupt.soft = 0;
events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
- events->nmi.pending = vcpu->arch.nmi_pending != 0;
+ events->nmi.pending = kvm_get_nr_pending_nmis(vcpu);
events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
- events->nmi.pad = 0;
- events->sipi_vector = 0; /* never valid when reporting to user space */
+ /* events->sipi_vector is never valid when reporting to user space */
+#ifdef CONFIG_KVM_SMM
events->smi.smm = is_smm(vcpu);
events->smi.pending = vcpu->arch.smi_pending;
events->smi.smm_inside_nmi =
!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+#endif
events->smi.latched_init = kvm_lapic_latched_init(vcpu);
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
@@ -5116,12 +5210,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
}
-
- memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
-
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
@@ -5185,8 +5275,11 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
- if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
- vcpu->arch.nmi_pending = events->nmi.pending;
+ if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) {
+ vcpu->arch.nmi_pending = 0;
+ atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
+ }
static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
@@ -5194,8 +5287,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+#ifdef CONFIG_KVM_SMM
if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
+ kvm_leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
}
@@ -5208,6 +5302,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
}
+#else
+ if (events->smi.smm || events->smi.pending ||
+ events->smi.smm_inside_nmi)
+ return -EINVAL;
+#endif
+
if (lapic_in_kernel(vcpu)) {
if (events->smi.latched_init)
set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
@@ -5235,12 +5335,11 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5491,10 +5590,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
}
return r;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- if (!kvm_x86_ops.enable_direct_tlbflush)
+ if (!kvm_x86_ops.enable_l2_tlb_flush)
return -ENOTTY;
- return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
case KVM_CAP_HYPERV_ENFORCE_CPUID:
return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
@@ -5574,7 +5673,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SMI: {
- r = kvm_vcpu_ioctl_smi(vcpu);
+ r = kvm_inject_smi(vcpu);
break;
}
case KVM_SET_CPUID: {
@@ -5988,11 +6087,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
-{
- return kvm->arch.n_max_mmu_pages;
-}
-
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
struct kvm_pic *pic = kvm->arch.vpic;
@@ -6208,15 +6302,26 @@ split_irqchip_unlock:
if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
break;
- if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
- kvm_can_mwait_in_guest())
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
+
+#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
+ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
+
+ if (!mitigate_smt_rsb) {
+ if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
+ (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
+ pr_warn_once(SMT_RSB_MSG);
+
+ if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
+ kvm_can_mwait_in_guest())
+ kvm->arch.mwait_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
+ kvm->arch.hlt_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
+ }
+
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
@@ -6233,9 +6338,7 @@ split_irqchip_unlock:
break;
case KVM_CAP_X86_USER_SPACE_MSR:
r = -EINVAL;
- if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
- KVM_MSR_EXIT_REASON_UNKNOWN |
- KVM_MSR_EXIT_REASON_FILTER))
+ if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
break;
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
@@ -6412,7 +6515,7 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (!user_range->nmsrs)
return 0;
- if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
return -EINVAL;
if (!user_range->flags)
@@ -6443,10 +6546,10 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
struct kvm_x86_msr_filter *new_filter, *old_filter;
bool default_allow;
bool empty = true;
- int r = 0;
+ int r;
u32 i;
- if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+ if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
@@ -6469,17 +6572,14 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
}
mutex_lock(&kvm->lock);
-
- /* The per-VM filter is protected by kvm->lock... */
- old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
-
- rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
synchronize_srcu(&kvm->srcu);
kvm_free_msr_filter(old_filter);
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
- mutex_unlock(&kvm->lock);
return 0;
}
@@ -6633,8 +6733,7 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
-long kvm_arch_vm_ioctl(struct file *filp,
- unsigned int ioctl, unsigned long arg)
+int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
@@ -6672,9 +6771,6 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
- case KVM_GET_NR_MMU_PAGES:
- r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
- break;
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -6979,83 +7075,114 @@ out:
return r;
}
-static void kvm_init_msr_list(void)
+static void kvm_probe_feature_msr(u32 msr_index)
+{
+ struct kvm_msr_entry msr = {
+ .index = msr_index,
+ };
+
+ if (kvm_get_msr_feature(&msr))
+ return;
+
+ msr_based_features[num_msr_based_features++] = msr_index;
+}
+
+static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
+
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ case MSR_IA32_TSX_CTRL:
+ if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ return;
+ break;
+ default:
+ break;
+ }
+
+ msrs_to_save[num_msrs_to_save++] = msr_index;
+}
+
+static void kvm_init_msr_lists(void)
+{
unsigned i;
BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
- "Please update the fixed PMCs in msrs_to_saved_all[]");
+ "Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
num_emulated_msrs = 0;
num_msr_based_features = 0;
- for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
- if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
- continue;
-
- /*
- * Even MSRs that are valid in the host may not be exposed
- * to the guests in some cases.
- */
- switch (msrs_to_save_all[i]) {
- case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
- continue;
- break;
- case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
- !kvm_cpu_cap_has(X86_FEATURE_RDPID))
- continue;
- break;
- case MSR_IA32_UMWAIT_CONTROL:
- if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
- continue;
- break;
- case MSR_IA32_RTIT_CTL:
- case MSR_IA32_RTIT_STATUS:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
- continue;
- break;
- case MSR_IA32_RTIT_CR3_MATCH:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
- continue;
- break;
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
- !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
- continue;
- break;
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
- intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
- continue;
- break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_IA32_XFD:
- case MSR_IA32_XFD_ERR:
- if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
- continue;
- break;
- default:
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
+ kvm_probe_msr_to_save(msrs_to_save_base[i]);
- msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
+ if (enable_pmu) {
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
+ kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
@@ -7065,15 +7192,11 @@ static void kvm_init_msr_list(void)
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
- struct kvm_msr_entry msr;
+ for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++)
+ kvm_probe_feature_msr(i);
- msr.index = msr_based_features_all[i];
- if (kvm_get_msr_feature(&msr))
- continue;
-
- msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
- }
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++)
+ kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -7119,8 +7242,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
return handled;
}
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
- struct kvm_segment *var, int seg)
+void kvm_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg)
{
static_call(kvm_x86_set_segment)(vcpu, var, seg);
}
@@ -7156,16 +7279,6 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
}
EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
- struct x86_exception *exception)
-{
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
-
- u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
- access |= PFERR_FETCH_MASK;
- return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
-}
-
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
@@ -7278,15 +7391,6 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
}
-static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
- unsigned long addr, void *val, unsigned int bytes)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
-
- return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
-}
-
static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u64 access,
struct x86_exception *exception)
@@ -7701,7 +7805,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ pr_warn_once("emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
@@ -8078,26 +8182,6 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
- u32 msr_index, u64 data)
-{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
-}
-
-static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- return vcpu->arch.smbase;
-}
-
-static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- vcpu->arch.smbase = smbase;
-}
-
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
@@ -8167,23 +8251,23 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
-static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
{
- return emul_to_vcpu(ctxt)->arch.hflags;
+ return is_smm(emul_to_vcpu(ctxt));
}
-static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
+static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- kvm_smm_changed(vcpu, false);
+ return is_guest_mode(emul_to_vcpu(ctxt));
}
-static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
- const char *smstate)
+#ifndef CONFIG_KVM_SMM
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
{
- return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ WARN_ON_ONCE(1);
+ return X86EMUL_UNHANDLEABLE;
}
+#endif
static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
@@ -8209,7 +8293,6 @@ static const struct x86_emulate_ops emulate_ops = {
.write_gpr = emulator_write_gpr,
.read_std = emulator_read_std,
.write_std = emulator_write_std,
- .read_phys = kvm_read_guest_phys_system,
.fetch = kvm_fetch_guest_virt,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -8229,11 +8312,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
- .get_smbase = emulator_get_smbase,
- .set_smbase = emulator_set_smbase,
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
- .set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
@@ -8247,8 +8327,8 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
- .get_hflags = emulator_get_hflags,
- .exiting_smm = emulator_exiting_smm,
+ .is_smm = emulator_is_smm,
+ .is_guest_mode = emulator_is_guest_mode,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
@@ -8292,7 +8372,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
if (!ctxt) {
- pr_err("kvm: failed to allocate vcpu's emulator\n");
+ pr_err("failed to allocate vcpu's emulator\n");
return NULL;
}
@@ -8320,10 +8400,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
- BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
- BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
-
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
@@ -8456,7 +8532,6 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- bool write_fault_to_shadow_pgtable,
int emulation_type)
{
gpa_t gpa = cr2_or_gpa;
@@ -8527,7 +8602,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* be fixed by unprotecting shadow page and it should
* be reported to userspace.
*/
- return !write_fault_to_shadow_pgtable;
+ return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -8581,29 +8656,6 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
-{
- trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
-
- if (entering_smm) {
- vcpu->arch.hflags |= HF_SMM_MASK;
- } else {
- vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
-
- /* Process a latched INIT or SMI, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-
- /*
- * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
- * on SMM exit we still need to reload them from
- * guest memory
- */
- vcpu->arch.pdptrs_from_userspace = false;
- }
-
- kvm_mmu_reset_context(vcpu);
-}
-
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -8798,20 +8850,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int r;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt;
if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
- /*
- * Clear write_fault_to_shadow_pgtable here to ensure it is
- * never reused.
- */
- write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
- vcpu->arch.write_fault_to_shadow_pgtable = false;
-
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
@@ -8832,10 +8876,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return 1;
}
if (reexecute_instruction(vcpu, cr2_or_gpa,
- write_fault_to_spt,
emulation_type))
return 1;
- if (ctxt->have_exception) {
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
/*
* #UD should result in just EMULATION_FAILED, and trap-like
* exception should not be encountered during decode.
@@ -8909,14 +8954,15 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
- emulation_type))
+ if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
return 1;
return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
+ WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write);
+ vcpu->mmio_needed = false;
r = 1;
inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
@@ -9099,9 +9145,11 @@ static void tsc_khz_changed(void *data)
struct cpufreq_freqs *freq = data;
unsigned long khz = 0;
+ WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
+
if (data)
khz = freq->new;
- else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ else
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
@@ -9122,8 +9170,10 @@ static void kvm_hyperv_tsc_notifier(void)
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
- for_each_present_cpu(cpu)
- per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ for_each_present_cpu(cpu)
+ per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+ }
kvm_caps.max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -9260,10 +9310,10 @@ static void kvm_timer_init(void)
}
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- }
- cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
- kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
+ }
}
#ifdef CONFIG_X86_64
@@ -9322,35 +9372,66 @@ static struct notifier_block pvclock_gtod_notifier = {
};
#endif
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
- struct kvm_x86_init_ops *ops = opaque;
u64 host_pat;
- int r;
+ int r, cpu;
if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
+ pr_err("inadequate fpu\n");
return -EOPNOTSUPP;
}
@@ -9368,19 +9449,19 @@ int kvm_arch_init(void *opaque)
*/
if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
+ pr_err("host PAT[0] is not WB\n");
return -EIO;
}
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ pr_err("failed to allocate cache for x86 emulator\n");
return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
@@ -9390,13 +9471,37 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_timer_init();
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability(ops->pmu_ops);
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
@@ -9406,8 +9511,35 @@ int kvm_arch_init(void *opaque)
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_lists();
return 0;
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
@@ -9415,24 +9547,39 @@ out_free_x86_emulator_cache:
return r;
}
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
+{
+ kvm_unregister_perf_callbacks();
+
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
#endif
kvm_lapic_exit();
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+ }
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -9440,7 +9587,11 @@ void kvm_arch_exit(void)
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
@@ -9712,7 +9863,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = attrs;
- vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->run->hypercall.flags = 0;
+ if (op_64_bit)
+ vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
return 0;
}
@@ -9805,7 +9960,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
kvm_x86_ops.nested_ops->triple_fault(vcpu);
return 1;
}
@@ -9815,13 +9970,20 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
+ /*
+ * Suppress the error code if the vCPU is in Real Mode, as Real Mode
+ * exceptions don't report error codes. The presence of an error code
+ * is carried with the exception and only stripped when the exception
+ * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do
+ * report an error code despite the CPU being in Real Mode.
+ */
+ vcpu->arch.exception.has_error_code &= is_protmode(vcpu);
+
trace_kvm_inj_exception(vcpu->arch.exception.vector,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
- vcpu->arch.exception.error_code = false;
static_call(kvm_x86_inject_exception)(vcpu);
}
@@ -9993,6 +10155,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
* in order to make progress and get back here for another iteration.
* The kvm_x86_ops hooks communicate this by returning -EBUSY.
*/
+#ifdef CONFIG_KVM_SMM
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
@@ -10005,6 +10168,7 @@ static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
} else
static_call(kvm_x86_enable_smi_window)(vcpu);
}
+#endif
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
@@ -10065,259 +10229,46 @@ out:
static void process_nmi(struct kvm_vcpu *vcpu)
{
- unsigned limit = 2;
+ unsigned int limit;
/*
- * x86 is limited to one NMI running, and one NMI pending after it.
- * If an NMI is already in progress, limit further NMIs to just one.
- * Otherwise, allow two (and we'll inject the first one immediately).
+ * x86 is limited to one NMI pending, but because KVM can't react to
+ * incoming NMIs as quickly as bare metal, e.g. if the vCPU is
+ * scheduled out, KVM needs to play nice with two queued NMIs showing
+ * up at the same time. To handle this scenario, allow two NMIs to be
+ * (temporarily) pending so long as NMIs are not blocked and KVM is not
+ * waiting for a previous NMI injection to complete (which effectively
+ * blocks NMIs). KVM will immediately inject one of the two NMIs, and
+ * will request an NMI window to handle the second NMI.
*/
if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
-
- vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
- vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
-}
-
-static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
-{
- u32 flags = 0;
- flags |= seg->g << 23;
- flags |= seg->db << 22;
- flags |= seg->l << 21;
- flags |= seg->avl << 20;
- flags |= seg->present << 15;
- flags |= seg->dpl << 13;
- flags |= seg->s << 12;
- flags |= seg->type << 8;
- return flags;
-}
-
-static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
-
- kvm_get_segment(vcpu, &seg, n);
- put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
-
- if (n < 3)
- offset = 0x7f84 + n * 12;
else
- offset = 0x7f2c + (n - 3) * 12;
-
- put_smstate(u32, buf, offset + 8, seg.base);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
-{
- struct kvm_segment seg;
- int offset;
- u16 flags;
-
- kvm_get_segment(vcpu, &seg, n);
- offset = 0x7e00 + n * 16;
-
- flags = enter_smm_get_segment_flags(&seg) >> 8;
- put_smstate(u16, buf, offset, seg.selector);
- put_smstate(u16, buf, offset + 2, flags);
- put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u64, buf, offset + 8, seg.base);
-}
-#endif
-
-static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
- put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
- put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
- put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
-
- for (i = 0; i < 8; i++)
- put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u32, buf, 0x7fcc, (u32)val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u32, buf, 0x7fc8, (u32)val);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u32, buf, 0x7fc4, seg.selector);
- put_smstate(u32, buf, 0x7f64, seg.base);
- put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u32, buf, 0x7fc0, seg.selector);
- put_smstate(u32, buf, 0x7f80, seg.base);
- put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f74, dt.address);
- put_smstate(u32, buf, 0x7f70, dt.size);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7f58, dt.address);
- put_smstate(u32, buf, 0x7f54, dt.size);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_32(vcpu, buf, i);
-
- put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020000);
- put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
-}
-
-#ifdef CONFIG_X86_64
-static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
-{
- struct desc_ptr dt;
- struct kvm_segment seg;
- unsigned long val;
- int i;
-
- for (i = 0; i < 16; i++)
- put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i));
-
- put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
- put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
-
- kvm_get_dr(vcpu, 6, &val);
- put_smstate(u64, buf, 0x7f68, val);
- kvm_get_dr(vcpu, 7, &val);
- put_smstate(u64, buf, 0x7f60, val);
-
- put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
- put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
- put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
-
- put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
-
- /* revision id */
- put_smstate(u32, buf, 0x7efc, 0x00020064);
-
- put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
- put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e94, seg.limit);
- put_smstate(u64, buf, 0x7e98, seg.base);
-
- static_call(kvm_x86_get_idt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e84, dt.size);
- put_smstate(u64, buf, 0x7e88, dt.address);
-
- kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
- put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
- put_smstate(u32, buf, 0x7e74, seg.limit);
- put_smstate(u64, buf, 0x7e78, seg.base);
-
- static_call(kvm_x86_get_gdt)(vcpu, &dt);
- put_smstate(u32, buf, 0x7e64, dt.size);
- put_smstate(u64, buf, 0x7e68, dt.address);
-
- for (i = 0; i < 6; i++)
- enter_smm_save_seg_64(vcpu, buf, i);
-}
-#endif
-
-static void enter_smm(struct kvm_vcpu *vcpu)
-{
- struct kvm_segment cs, ds;
- struct desc_ptr dt;
- unsigned long cr0;
- char buf[512];
-
- memset(buf, 0, 512);
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- enter_smm_save_state_64(vcpu, buf);
- else
-#endif
- enter_smm_save_state_32(vcpu, buf);
+ limit = 2;
/*
- * Give enter_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. leave guest mode) after we've saved the state into the
- * SMM state-save area.
+ * Adjust the limit to account for pending virtual NMIs, which aren't
+ * tracked in vcpu->arch.nmi_pending.
*/
- static_call(kvm_x86_enter_smm)(vcpu, buf);
-
- kvm_smm_changed(vcpu, true);
- kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
-
- if (static_call(kvm_x86_get_nmi_mask)(vcpu))
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- static_call(kvm_x86_set_nmi_mask)(vcpu, true);
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0x8000);
-
- cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- static_call(kvm_x86_set_cr0)(vcpu, cr0);
- vcpu->arch.cr0 = cr0;
-
- static_call(kvm_x86_set_cr4)(vcpu, 0);
-
- /* Undocumented: IDT limit is set to zero on entry to SMM. */
- dt.address = dt.size = 0;
- static_call(kvm_x86_set_idt)(vcpu, &dt);
-
- kvm_set_dr(vcpu, 7, DR7_FIXED_1);
-
- cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
- cs.base = vcpu->arch.smbase;
-
- ds.selector = 0;
- ds.base = 0;
+ if (static_call(kvm_x86_is_vnmi_pending)(vcpu))
+ limit--;
- cs.limit = ds.limit = 0xffffffff;
- cs.type = ds.type = 0x3;
- cs.dpl = ds.dpl = 0;
- cs.db = ds.db = 0;
- cs.s = ds.s = 1;
- cs.l = ds.l = 0;
- cs.g = ds.g = 1;
- cs.avl = ds.avl = 0;
- cs.present = ds.present = 1;
- cs.unusable = ds.unusable = 0;
- cs.padding = ds.padding = 0;
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
- kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
- kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+ if (vcpu->arch.nmi_pending &&
+ (static_call(kvm_x86_set_vnmi_pending)(vcpu)))
+ vcpu->arch.nmi_pending--;
-#ifdef CONFIG_X86_64
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- static_call(kvm_x86_set_efer)(vcpu, 0);
-#endif
-
- kvm_update_cpuid_runtime(vcpu);
- kvm_mmu_reset_context(vcpu);
+ if (vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-static void process_smi(struct kvm_vcpu *vcpu)
+/* Return total number of NMIs pending injection to the VM */
+int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu)
{
- vcpu->arch.smi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return vcpu->arch.nmi_pending +
+ static_call(kvm_x86_is_vnmi_pending)(vcpu);
}
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
@@ -10331,7 +10282,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool activate;
@@ -10366,7 +10317,30 @@ out:
preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+
+static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+ * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+ * and hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can the AVIC doorbell to inject interrupts to
+ * running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+}
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set)
@@ -10375,7 +10349,7 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
@@ -10510,20 +10484,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
- /* Forbid vmenter if vcpu dirty ring is soft-full */
- if (unlikely(vcpu->kvm->dirty_ring_size &&
- kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) {
- vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
- trace_kvm_dirty_ring_exit(vcpu);
- r = 0;
- goto out;
- }
-
if (kvm_request_pending(vcpu)) {
if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
r = -EIO;
goto out;
}
+
+ if (kvm_dirty_ring_check_request(vcpu)) {
+ r = 0;
+ goto out;
+ }
+
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
@@ -10547,23 +10518,37 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
kvm_mmu_load_pgd(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+
+ /*
+ * Note, the order matters here, as flushing "all" TLB entries
+ * also flushes the "current" TLB entries, i.e. servicing the
+ * flush "all" will clear any request to flush "current".
+ */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_vcpu_flush_tlb_all(vcpu);
- /* Flushing all ASIDs flushes the current ASID... */
- kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
kvm_service_local_tlb_flush_requests(vcpu);
+ /*
+ * Fall back to a "full" guest flush if Hyper-V's precise
+ * flushing fails. Note, Hyper-V's flushing is per-vCPU, but
+ * the flushes are considered "remote" and not "local" because
+ * the requests can be initiated from other vCPUs.
+ */
+ if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
+ kvm_hv_vcpu_flush_tlb(vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
- if (is_guest_mode(vcpu)) {
+ if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
+ if (is_guest_mode(vcpu))
kvm_x86_ops.nested_ops->triple_fault(vcpu);
- } else {
+
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
vcpu->mmio_needed = 0;
r = 0;
@@ -10578,8 +10563,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+#ifdef CONFIG_KVM_SMM
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
+#endif
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
@@ -10771,6 +10758,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
break;
}
+
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
}
/*
@@ -11806,7 +11796,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ pr_warn_once("SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
if (!kvm->arch.max_vcpu_ids)
@@ -11827,7 +11817,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -11883,7 +11873,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_wbinvd_dirty_mask;
if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- pr_err("kvm: failed to allocate vcpu's fpu\n");
+ pr_err("failed to allocate vcpu's fpu\n");
goto free_emulate_ctxt;
}
@@ -11893,6 +11883,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
kvm_async_pf_hash_reset(vcpu);
+
+ vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
kvm_pmu_init(vcpu);
vcpu->arch.pending_external_vector = -1;
@@ -11997,8 +11989,18 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
WARN_ON_ONCE(!init_event &&
(old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
+ /*
+ * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
+ * possible to INIT the vCPU while L2 is active. Force the vCPU back
+ * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
+ * bits), i.e. virtualization is disabled.
+ */
+ if (is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+
kvm_lapic_reset(vcpu, init_event);
+ WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
vcpu->arch.hflags = 0;
vcpu->arch.smi_pending = 0;
@@ -12145,6 +12147,11 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -12231,93 +12238,10 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
- kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- struct kvm_x86_init_ops *ops = opaque;
- int r;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
- kvm_init_pmu_capability();
-
- r = ops->hardware_setup();
- if (r != 0)
- return r;
-
- kvm_ops_update(ops);
-
- kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
- if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
- if (kvm_caps.has_tsc_control) {
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated because it will always
- * be 1 on all machines.
- */
- u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- kvm_caps.max_guest_tsc_khz = max;
- }
- kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_unregister_perf_callbacks();
-
- static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- struct kvm_x86_init_ops *ops = opaque;
-
- WARN_ON(!irqs_disabled());
-
- if (__cr4_reserved_bits(cpu_has, c) !=
- __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- return -EIO;
-
- return ops->check_processor_compatibility();
-}
-
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
@@ -12486,7 +12410,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
*/
hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)hva))
+ if (IS_ERR_VALUE(hva))
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
@@ -12701,16 +12625,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
- struct kvm_arch *ka = &kvm->arch;
+ int nr_slots;
if (!kvm_x86_ops.cpu_dirty_log_size)
return;
- if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
- (!enable && --ka->cpu_dirty_logging_count == 0))
+ nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
+ if ((enable && nr_slots == 1) || !nr_slots)
kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
-
- WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
@@ -12892,10 +12814,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
static_call(kvm_x86_nmi_allowed)(vcpu, false)))
return true;
+#ifdef CONFIG_KVM_SMM
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
(vcpu->arch.smi_pending &&
static_call(kvm_x86_smi_allowed)(vcpu, false)))
return true;
+#endif
if (kvm_arch_interrupt_allowed(vcpu) &&
(kvm_cpu_has_interrupt(vcpu) ||
@@ -12936,7 +12860,9 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
@@ -13392,6 +13318,9 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
struct x86_exception *e)
{
if (r == X86EMUL_PROPAGATE_FAULT) {
+ if (KVM_BUG_ON(!e, vcpu->kvm))
+ return -EIO;
+
kvm_inject_emulated_page_fault(vcpu, e);
return 1;
}
@@ -13428,7 +13357,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+ pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);
switch (type) {
case INVPCID_TYPE_INDIV_ADDR:
@@ -13713,6 +13642,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
static int __init kvm_x86_init(void)
{
kvm_mmu_x86_module_init();
+ mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
return 0;
}
module_init(kvm_x86_init);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 829d3134c1eb..c544602d07a3 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -3,6 +3,7 @@
#define ARCH_X86_KVM_X86_H
#include <linux/kvm_host.h>
+#include <asm/fpu/xstate.h>
#include <asm/mce.h>
#include <asm/pvclock.h>
#include "kvm_cache_regs.h"
@@ -27,6 +28,7 @@ struct kvm_caps {
u64 supported_mce_cap;
u64 supported_xcr0;
u64 supported_xss;
+ u64 supported_perf_cap;
};
void kvm_spurious_fault(void);
@@ -39,6 +41,14 @@ void kvm_spurious_fault(void);
failed; \
})
+/*
+ * The first...last VMX feature MSRs that are emulated by KVM. This may or may
+ * not cover all known VMX MSRs, as KVM doesn't emulate an MSR until there's an
+ * associated feature that KVM supports for nested virtualization.
+ */
+#define KVM_FIRST_EMULATED_VMX_MSR MSR_IA32_VMX_BASIC
+#define KVM_LAST_EMULATED_VMX_MSR MSR_IA32_VMX_VMFUNC
+
#define KVM_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
#define KVM_DEFAULT_PLE_WINDOW_GROW 2
@@ -82,6 +92,11 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.last_vmentry_cpu != -1;
+}
+
static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
{
return vcpu->arch.exception.pending ||
@@ -122,15 +137,15 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
static inline bool is_protmode(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
+ return kvm_is_cr0_bit_set(vcpu, X86_CR0_PE);
}
-static inline int is_long_mode(struct kvm_vcpu *vcpu)
+static inline bool is_long_mode(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
- return vcpu->arch.efer & EFER_LMA;
+ return !!(vcpu->arch.efer & EFER_LMA);
#else
- return 0;
+ return false;
#endif
}
@@ -170,19 +185,19 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
-static inline int is_pae(struct kvm_vcpu *vcpu)
+static inline bool is_pae(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PAE);
}
-static inline int is_pse(struct kvm_vcpu *vcpu)
+static inline bool is_pse(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_PSE);
}
-static inline int is_paging(struct kvm_vcpu *vcpu)
+static inline bool is_paging(struct kvm_vcpu *vcpu)
{
- return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
+ return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
}
static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
@@ -192,7 +207,7 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
+ return kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? 57 : 48;
}
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
@@ -314,6 +329,34 @@ extern struct kvm_caps kvm_caps;
extern bool enable_pmu;
+/*
+ * Get a filtered version of KVM's supported XCR0 that strips out dynamic
+ * features for which the current process doesn't (yet) have permission to use.
+ * This is intended to be used only when enumerating support to userspace,
+ * e.g. in KVM_GET_SUPPORTED_CPUID and KVM_CAP_XSAVE2, it does NOT need to be
+ * used to check/restrict guest behavior as KVM rejects KVM_SET_CPUID{2} if
+ * userspace attempts to enable unpermitted features.
+ */
+static inline u64 kvm_get_filtered_xcr0(void)
+{
+ u64 permitted_xcr0 = kvm_caps.supported_xcr0;
+
+ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA);
+
+ if (permitted_xcr0 & XFEATURE_MASK_USER_DYNAMIC) {
+ permitted_xcr0 &= xstate_get_guest_group_perm();
+
+ /*
+ * Treat XTILE_CFG as unsupported if the current process isn't
+ * allowed to use XTILE_DATA, as attempting to set XTILE_CFG in
+ * XCR0 without setting XTILE_DATA is architecturally illegal.
+ */
+ if (!(permitted_xcr0 & XFEATURE_MASK_XTILE_DATA))
+ permitted_xcr0 &= ~XFEATURE_MASK_XTILE_CFG;
+ }
+ return permitted_xcr0;
+}
+
static inline bool kvm_mpx_supported(void)
{
return (kvm_caps.supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
@@ -330,6 +373,18 @@ extern bool report_ignored_msrs;
extern bool eager_page_split;
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -381,13 +436,13 @@ enum kvm_intr_type {
KVM_HANDLING_NMI,
};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
- enum kvm_intr_type intr)
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
-static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 2dae413bd62a..40edf4d1974c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -5,6 +5,7 @@
*
* KVM Xen emulation
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "xen.h"
@@ -22,6 +23,9 @@
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>
+#include <asm/xen/cpuid.h>
+
+#include "cpuid.h"
#include "trace.h"
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
@@ -41,14 +45,13 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == GPA_INVALID) {
- kvm_gpc_deactivate(kvm, gpc);
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(gpc);
goto out;
}
do {
- ret = kvm_gpc_activate(kvm, gpc, NULL, KVM_HOST_USES_PFN, gpa,
- PAGE_SIZE);
+ ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
if (ret)
goto out;
@@ -170,112 +173,45 @@ static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
vcpu->arch.xen.timer.function = xen_timer_callback;
}
-static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- u64 now = get_kvmclock_ns(v->kvm);
- u64 delta_ns = now - vx->runstate_entry_time;
- u64 run_delay = current->sched_info.run_delay;
-
- if (unlikely(!vx->runstate_entry_time))
- vx->current_runstate = RUNSTATE_offline;
-
- /*
- * Time waiting for the scheduler isn't "stolen" if the
- * vCPU wasn't running anyway.
- */
- if (vx->current_runstate == RUNSTATE_running) {
- u64 steal_ns = run_delay - vx->last_steal;
-
- delta_ns -= steal_ns;
-
- vx->runstate_times[RUNSTATE_runnable] += steal_ns;
- }
- vx->last_steal = run_delay;
-
- vx->runstate_times[vx->current_runstate] += delta_ns;
- vx->current_runstate = state;
- vx->runstate_entry_time = now;
-}
-
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
-{
- struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
- uint64_t *user_times;
+ struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
+ struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
+ size_t user_len, user_len1, user_len2;
+ struct vcpu_runstate_info rs;
unsigned long flags;
- size_t user_len;
- int *user_state;
-
- kvm_xen_update_runstate(v, state);
-
- if (!vx->runstate_cache.active)
- return;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_len = sizeof(struct vcpu_runstate_info);
- else
- user_len = sizeof(struct compat_vcpu_runstate_info);
-
- read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- user_len)) {
- read_unlock_irqrestore(&gpc->lock, flags);
-
- /* When invoked from kvm_sched_out() we cannot sleep */
- if (state == RUNSTATE_runnable)
- return;
-
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
- return;
-
- read_lock_irqsave(&gpc->lock, flags);
- }
+ size_t times_ofs;
+ uint8_t *update_bit = NULL;
+ uint64_t entry_time;
+ uint64_t *rs_times;
+ int *rs_state;
/*
* The only difference between 32-bit and 64-bit versions of the
- * runstate struct us the alignment of uint64_t in 32-bit, which
+ * runstate struct is the alignment of uint64_t in 32-bit, which
* means that the 64-bit version has an additional 4 bytes of
- * padding after the first field 'state'.
- *
- * So we use 'int __user *user_state' to point to the state field,
- * and 'uint64_t __user *user_times' for runstate_entry_time. So
- * the actual array of time[] in each state starts at user_times[1].
+ * padding after the first field 'state'. Let's be really really
+ * paranoid about that, and matching it with our internal data
+ * structures that we memcpy into it...
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
+ /*
+ * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
+ * so each subsequent field is shifted by 4, and it's 4 bytes longer.
+ */
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
+ BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
-
- user_state = gpc->khva;
-
- if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
- user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
- state_entry_time);
- else
- user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
- state_entry_time);
-
/*
- * First write the updated state_entry_time at the appropriate
- * location determined by 'offset'.
- */
- BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
- BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(user_times[0]));
-
- user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
- smp_wmb();
-
- /*
- * Next, write the new runstate. This is in the *same* place
- * for 32-bit and 64-bit guests, asserted here for paranoia.
+ * The state field is in the same place at the start of both structs,
+ * and is the same size (int) as vx->current_runstate.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
offsetof(struct compat_vcpu_runstate_info, state));
@@ -284,34 +220,255 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- *user_state = vx->current_runstate;
+ /*
+ * The state_entry_time field is 64 bits in both versions, and the
+ * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
+ * is little-endian means that it's in the last *byte* of the word.
+ * That detail is important later.
+ */
+ BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
+ sizeof(uint64_t));
+ BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);
/*
- * Write the actual runstate times immediately after the
- * runstate_entry_time.
+ * The time array is four 64-bit quantities in both versions, matching
+ * the vx->runstate_times and immediately following state_entry_time.
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
- offsetof(struct vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
- offsetof(struct compat_vcpu_runstate_info, time) - sizeof(u64));
+ offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof_field(struct compat_vcpu_runstate_info, time));
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
- smp_wmb();
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ user_len = sizeof(struct vcpu_runstate_info);
+ times_ofs = offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ } else {
+ user_len = sizeof(struct compat_vcpu_runstate_info);
+ times_ofs = offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+ }
/*
- * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
- * runstate_entry_time field.
+ * There are basically no alignment constraints. The guest can set it
+ * up so it crosses from one page to the next, and at arbitrary byte
+ * alignment (and the 32-bit ABI doesn't align the 64-bit integers
+ * anyway, even if the overall struct had been 64-bit aligned).
*/
- user_times[0] &= ~XEN_RUNSTATE_UPDATE;
+ if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
+ user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
+ user_len2 = user_len - user_len1;
+ } else {
+ user_len1 = user_len;
+ user_len2 = 0;
+ }
+ BUG_ON(user_len1 + user_len2 != user_len);
+
+ retry:
+ /*
+ * Attempt to obtain the GPC lock on *both* (if there are two)
+ * gfn_to_pfn caches that cover the region.
+ */
+ if (atomic) {
+ local_irq_save(flags);
+ if (!read_trylock(&gpc1->lock)) {
+ local_irq_restore(flags);
+ return;
+ }
+ } else {
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+ while (!kvm_gpc_check(gpc1, user_len1)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ if (kvm_gpc_refresh(gpc1, user_len1))
+ return;
+
+ read_lock_irqsave(&gpc1->lock, flags);
+ }
+
+ if (likely(!user_len2)) {
+ /*
+ * Set up three pointers directly to the runstate_info
+ * struct in the guest (via the GPC).
+ *
+ * • @rs_state → state field
+ * • @rs_times → state_entry_time field.
+ * • @update_bit → last byte of state_entry_time, which
+ * contains the XEN_RUNSTATE_UPDATE bit.
+ */
+ rs_state = gpc1->khva;
+ rs_times = gpc1->khva + times_ofs;
+ if (v->kvm->arch.xen.runstate_update_flag)
+ update_bit = ((void *)(&rs_times[1])) - 1;
+ } else {
+ /*
+ * The guest's runstate_info is split across two pages and we
+ * need to hold and validate both GPCs simultaneously. We can
+ * declare a lock ordering GPC1 > GPC2 because nothing else
+ * takes them more than one at a time. Set a subclass on the
+ * gpc1 lock to make lockdep shut up about it.
+ */
+ lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
+ if (atomic) {
+ if (!read_trylock(&gpc2->lock)) {
+ read_unlock_irqrestore(&gpc1->lock, flags);
+ return;
+ }
+ } else {
+ read_lock(&gpc2->lock);
+ }
+
+ if (!kvm_gpc_check(gpc2, user_len2)) {
+ read_unlock(&gpc2->lock);
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (atomic)
+ return;
+
+ /*
+ * Use kvm_gpc_activate() here because if the runstate
+ * area was configured in 32-bit mode and only extends
+ * to the second page now because the guest changed to
+ * 64-bit mode, the second GPC won't have been set up.
+ */
+ if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
+ user_len2))
+ return;
+
+ /*
+ * We dropped the lock on GPC1 so we have to go all the
+ * way back and revalidate that too.
+ */
+ goto retry;
+ }
+
+ /*
+ * In this case, the runstate_info struct will be assembled on
+ * the kernel stack (compat or not as appropriate) and will
+ * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
+ * rs pointers accordingly.
+ */
+ rs_times = &rs.state_entry_time;
+
+ /*
+ * The rs_state pointer points to the start of what we'll
+ * copy to the guest, which in the case of a compat guest
+ * is the 32-bit field that the compiler thinks is padding.
+ */
+ rs_state = ((void *)rs_times) - times_ofs;
+
+ /*
+ * The update_bit is still directly in the guest memory,
+ * via one GPC or the other.
+ */
+ if (v->kvm->arch.xen.runstate_update_flag) {
+ if (user_len1 >= times_ofs + sizeof(uint64_t))
+ update_bit = gpc1->khva + times_ofs +
+ sizeof(uint64_t) - 1;
+ else
+ update_bit = gpc2->khva + times_ofs +
+ sizeof(uint64_t) - 1 - user_len1;
+ }
+
+#ifdef CONFIG_X86_64
+ /*
+ * Don't leak kernel memory through the padding in the 64-bit
+ * version of the struct.
+ */
+ memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
+#endif
+ }
+
+ /*
+ * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
+ * state_entry_time field, directly in the guest. We need to set
+ * that (and write-barrier) before writing to the rest of the
+ * structure, and clear it last. Just as Xen does, we address the
+ * single *byte* in which it resides because it might be in a
+ * different cache line to the rest of the 64-bit word, due to
+ * the (lack of) alignment constraints.
+ */
+ entry_time = vx->runstate_entry_time;
+ if (update_bit) {
+ entry_time |= XEN_RUNSTATE_UPDATE;
+ *update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
+ smp_wmb();
+ }
+
+ /*
+ * Now assemble the actual structure, either on our kernel stack
+ * or directly in the guest according to how the rs_state and
+ * rs_times pointers were set up above.
+ */
+ *rs_state = vx->current_runstate;
+ rs_times[0] = entry_time;
+ memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
+
+ /* For the split case, we have to then copy it to the guest. */
+ if (user_len2) {
+ memcpy(gpc1->khva, rs_state, user_len1);
+ memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
+ }
smp_wmb();
- read_unlock_irqrestore(&gpc->lock, flags);
+ /* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
+ if (update_bit) {
+ entry_time &= ~XEN_RUNSTATE_UPDATE;
+ *update_bit = entry_time >> 56;
+ smp_wmb();
+ }
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ read_unlock(&gpc2->lock);
+
+ read_unlock_irqrestore(&gpc1->lock, flags);
+
+ mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
+ if (user_len2)
+ mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
+}
+
+void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
+{
+ struct kvm_vcpu_xen *vx = &v->arch.xen;
+ u64 now = get_kvmclock_ns(v->kvm);
+ u64 delta_ns = now - vx->runstate_entry_time;
+ u64 run_delay = current->sched_info.run_delay;
+
+ if (unlikely(!vx->runstate_entry_time))
+ vx->current_runstate = RUNSTATE_offline;
+
+ /*
+ * Time waiting for the scheduler isn't "stolen" if the
+ * vCPU wasn't running anyway.
+ */
+ if (vx->current_runstate == RUNSTATE_running) {
+ u64 steal_ns = run_delay - vx->last_steal;
+
+ delta_ns -= steal_ns;
+
+ vx->runstate_times[RUNSTATE_runnable] += steal_ns;
+ }
+ vx->last_steal = run_delay;
+
+ vx->runstate_times[vx->current_runstate] += delta_ns;
+ vx->current_runstate = state;
+ vx->runstate_entry_time = now;
+
+ if (vx->runstate_cache.active)
+ kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
@@ -352,12 +509,10 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
* little more honest about it.
*/
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info)))
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
return;
read_lock_irqsave(&gpc->lock, flags);
@@ -417,8 +572,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
read_lock_irqsave(&gpc->lock, flags);
- while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
read_unlock_irqrestore(&gpc->lock, flags);
/*
@@ -432,8 +586,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (in_atomic() || !task_is_running(current))
return 1;
- if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
- sizeof(struct vcpu_info))) {
+ if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
@@ -458,26 +611,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.upcall_vector = data->u.vector;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
}
break;
@@ -487,9 +640,20 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_XEN_VERSION:
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.xen_version = data->u.xen_version;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ r = 0;
+ break;
+
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
+ mutex_unlock(&kvm->arch.xen.xen_lock);
r = 0;
break;
@@ -504,7 +668,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
@@ -516,7 +680,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
if (kvm->arch.xen.shinfo_cache.active)
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
- data->u.shared_info.gfn = GPA_INVALID;
+ data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
@@ -530,11 +694,20 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
+ if (!sched_info_on()) {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
+ r = 0;
+ break;
+
default:
break;
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return r;
}
@@ -542,7 +715,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int idx, r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
idx = srcu_read_lock(&vcpu->kvm->srcu);
switch (data->type) {
@@ -553,54 +726,80 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_info_cache, NULL,
- KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_info));
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
- if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache);
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- r = kvm_gpc_activate(vcpu->kvm,
- &vcpu->arch.xen.vcpu_time_info_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
+ data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
break;
- case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
+ case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
+ size_t sz, sz1, sz2;
+
if (!sched_info_on()) {
r = -EOPNOTSUPP;
break;
}
- if (data->u.gpa == GPA_INVALID) {
- kvm_gpc_deactivate(vcpu->kvm,
- &vcpu->arch.xen.runstate_cache);
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
r = 0;
+ deactivate_out:
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
break;
}
- r = kvm_gpc_activate(vcpu->kvm, &vcpu->arch.xen.runstate_cache,
- NULL, KVM_HOST_USES_PFN, data->u.gpa,
- sizeof(struct vcpu_runstate_info));
- break;
+ /*
+ * If the guest switches to 64-bit mode after setting the runstate
+ * address, that's actually OK. kvm_xen_update_runstate_guest()
+ * will cope.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
+ sz = sizeof(struct vcpu_runstate_info);
+ else
+ sz = sizeof(struct compat_vcpu_runstate_info);
+
+ /* How much fits in the (first) page? */
+ sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
+ data->u.gpa, sz1);
+ if (r)
+ goto deactivate_out;
+
+ /* Either map the second page, or deactivate the second GPC */
+ if (sz1 >= sz) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ } else {
+ sz2 = sz - sz1;
+ BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
+ r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
+ data->u.gpa + sz1, sz2);
+ if (r)
+ goto deactivate_out;
+ }
+ kvm_xen_update_runstate_guest(vcpu, false);
+ break;
+ }
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
if (!sched_info_on()) {
r = -EOPNOTSUPP;
@@ -693,6 +892,8 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
if (data->u.runstate.state <= RUNSTATE_offline)
kvm_xen_update_runstate(vcpu, data->u.runstate.state);
+ else if (vcpu->arch.xen.runstate_cache.active)
+ kvm_xen_update_runstate_guest(vcpu, false);
r = 0;
break;
@@ -742,7 +943,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
}
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -750,14 +951,14 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
int r = -ENOENT;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
if (vcpu->arch.xen.vcpu_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
- data->u.gpa = GPA_INVALID;
+ data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
@@ -765,7 +966,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
- data->u.gpa = GPA_INVALID;
+ data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
@@ -833,7 +1034,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
return r;
}
@@ -889,6 +1090,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
: kvm->arch.xen_hvm_config.blob_size_32;
u8 *page;
+ int ret;
if (page_num >= blob_size)
return 1;
@@ -899,10 +1101,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
if (IS_ERR(page))
return PTR_ERR(page);
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
- kfree(page);
+ ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
+ kfree(page);
+ if (ret)
return 1;
- }
}
return 0;
}
@@ -925,7 +1127,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
xhc->blob_size_32 || xhc->blob_size_64))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
static_branch_inc(&kvm_xen_enabled.key);
@@ -934,7 +1136,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
return 0;
}
@@ -954,6 +1156,14 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static inline int max_evtchn_port(struct kvm *kvm)
+{
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
+ return EVTCHN_2L_NR_CHANNELS;
+ else
+ return COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+
static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
evtchn_port_t *ports)
{
@@ -964,9 +1174,9 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
bool ret = true;
int idx, i;
- read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
ret = false;
@@ -986,8 +1196,8 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
}
out_rcu:
- srcu_read_unlock(&kvm->srcu, idx);
read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
@@ -995,23 +1205,40 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
u64 param, u64 *r)
{
- int idx, i;
struct sched_poll sched_poll;
evtchn_port_t port, *ports;
- gpa_t gpa;
+ struct x86_exception e;
+ int i;
- if (!longmode || !lapic_in_kernel(vcpu) ||
+ if (!lapic_in_kernel(vcpu) ||
!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
return false;
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
+ struct compat_sched_poll sp32;
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
- sizeof(sched_poll))) {
- *r = -EFAULT;
- return true;
+ /* Sanity check that the compat struct definition is correct */
+ BUILD_BUG_ON(sizeof(sp32) != 16);
+
+ if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /*
+ * This is a 32-bit pointer to an array of evtchn_port_t which
+ * are uint32_t, so once it's converted no further compat
+ * handling is needed.
+ */
+ sched_poll.ports = (void *)(unsigned long)(sp32.ports);
+ sched_poll.nr_ports = sp32.nr_ports;
+ sched_poll.timeout = sp32.timeout;
+ } else {
+ if (kvm_read_guest_virt(vcpu, param, &sched_poll,
+ sizeof(sched_poll), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
}
if (unlikely(sched_poll.nr_ports > 1)) {
@@ -1030,16 +1257,15 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
} else
ports = &port;
+ if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
+ sched_poll.nr_ports * sizeof(*ports), &e)) {
+ *r = -EFAULT;
+ return true;
+ }
+
for (i = 0; i < sched_poll.nr_ports; i++) {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu,
- (gva_t)(sched_poll.ports + i),
- NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
- &ports[i], sizeof(port))) {
- *r = -EFAULT;
+ if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
+ *r = -EINVAL;
goto out;
}
}
@@ -1113,9 +1339,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
int vcpu_id, u64 param, u64 *r)
{
struct vcpu_set_singleshot_timer oneshot;
+ struct x86_exception e;
s64 delta;
- gpa_t gpa;
- int idx;
if (!kvm_xen_timer_enabled(vcpu))
return false;
@@ -1126,9 +1351,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
*r = -EINVAL;
return true;
}
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
/*
* The only difference for 32-bit compat is the 4 bytes of
@@ -1146,9 +1368,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
sizeof_field(struct vcpu_set_singleshot_timer, flags));
- if (!gpa ||
- kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
- sizeof(struct compat_vcpu_set_singleshot_timer))) {
+ if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer), &e)) {
*r = -EFAULT;
return true;
}
@@ -1215,6 +1436,7 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
bool longmode;
u64 input, params[6], r = -ENOSYS;
bool handled = false;
+ u8 cpl;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -1242,9 +1464,17 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
params[5] = (u64)kvm_r9_read(vcpu);
}
#endif
- trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ /*
+ * Only allow hypercall acceleration for CPL0. The rare hypercalls that
+ * are permitted in guest userspace can be handled by the VMM.
+ */
+ if (unlikely(cpl > 0))
+ goto handle_in_userspace;
+
switch (input) {
case __HYPERVISOR_xen_version:
if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
@@ -1279,10 +1509,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
if (handled)
return kvm_xen_hypercall_set_result(vcpu, r);
+handle_in_userspace:
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
- vcpu->run->xen.u.hcall.cpl = static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->run->xen.u.hcall.cpl = cpl;
vcpu->run->xen.u.hcall.input = input;
vcpu->run->xen.u.hcall.params[0] = params[0];
vcpu->run->xen.u.hcall.params[1] = params[1];
@@ -1297,14 +1528,6 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
-static inline int max_evtchn_port(struct kvm *kvm)
-{
- if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
- return EVTCHN_2L_NR_CHANNELS;
- else
- return COMPAT_EVTCHN_2L_NR_CHANNELS;
-}
-
static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
{
int poll_evtchn = vcpu->arch.xen.poll_evtchn;
@@ -1357,7 +1580,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
idx = srcu_read_lock(&kvm->srcu);
read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ if (!kvm_gpc_check(gpc, PAGE_SIZE))
goto out_rcu;
if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
@@ -1391,7 +1614,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
gpc = &vcpu->arch.xen.vcpu_info_cache;
read_lock_irqsave(&gpc->lock, flags);
- if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
/*
* Could not access the vcpu_info. Set the bit in-kernel
* and prod the vCPU to deliver it for itself.
@@ -1456,15 +1679,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- /*
- * For the irqfd workqueue, using the main kvm->lock mutex is
- * fine since this function is invoked from kvm_set_irq() with
- * no other lock held, no srcu. In future if it will be called
- * directly from a vCPU thread (e.g. on hypercall for an IPI)
- * then it may need to switch to using a leaf-node mutex for
- * serializing the shared_info mapping.
- */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
/*
* It is theoretically possible for the page to be unmapped
@@ -1489,11 +1704,11 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
break;
idx = srcu_read_lock(&kvm->srcu);
- rc = kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpc->gpa, PAGE_SIZE);
+ rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -1606,20 +1821,20 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
{
u32 port = data->u.evtchn.send_port;
struct evtchnfd *evtchnfd;
+ int ret;
- if (!port || port >= max_evtchn_port(kvm))
- return -EINVAL;
-
- mutex_lock(&kvm->lock);
+ /* Protect writes to evtchnfd as well as the idr lookup. */
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ ret = -ENOENT;
if (!evtchnfd)
- return -ENOENT;
+ goto out_unlock;
/* For an UPDATE, nothing may change except the priority/vcpu */
+ ret = -EINVAL;
if (evtchnfd->type != data->u.evtchn.type)
- return -EINVAL;
+ goto out_unlock;
/*
* Port cannot change, and if it's zero that was an eventfd
@@ -1627,20 +1842,21 @@ static int kvm_xen_eventfd_update(struct kvm *kvm,
*/
if (!evtchnfd->deliver.port.port ||
evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
- return -EINVAL;
+ goto out_unlock;
/* We only support 2 level event channels for now */
if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
- return -EINVAL;
+ goto out_unlock;
- mutex_lock(&kvm->lock);
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
evtchnfd->deliver.port.vcpu_idx = -1;
}
- mutex_unlock(&kvm->lock);
- return 0;
+ ret = 0;
+out_unlock:
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return ret;
}
/*
@@ -1652,12 +1868,9 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
{
u32 port = data->u.evtchn.send_port;
struct eventfd_ctx *eventfd = NULL;
- struct evtchnfd *evtchnfd = NULL;
+ struct evtchnfd *evtchnfd;
int ret = -EINVAL;
- if (!port || port >= max_evtchn_port(kvm))
- return -EINVAL;
-
evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
if (!evtchnfd)
return -ENOMEM;
@@ -1705,10 +1918,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm,
evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
}
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
GFP_KERNEL);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (ret >= 0)
return 0;
@@ -1726,15 +1939,14 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
{
struct evtchnfd *evtchnfd;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.xen.xen_lock);
if (!evtchnfd)
return -ENOENT;
- if (kvm)
- synchronize_srcu(&kvm->srcu);
+ synchronize_srcu(&kvm->srcu);
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
@@ -1743,18 +1955,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
static int kvm_xen_eventfd_reset(struct kvm *kvm)
{
- struct evtchnfd *evtchnfd;
+ struct evtchnfd *evtchnfd, **all_evtchnfds;
int i;
+ int n = 0;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.xen.xen_lock);
+
+ /*
+ * Because synchronize_srcu() cannot be called inside the
+ * critical section, first collect all the evtchnfd objects
+ * in an array as they are removed from evtchn_ports.
+ */
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
+ n++;
+
+ all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
+ if (!all_evtchnfds) {
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+ return -ENOMEM;
+ }
+
+ n = 0;
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ all_evtchnfds[n++] = evtchnfd;
idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
- synchronize_srcu(&kvm->srcu);
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
+
+ synchronize_srcu(&kvm->srcu);
+
+ while (n--) {
+ evtchnfd = all_evtchnfds[n];
if (!evtchnfd->deliver.port.port)
eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
kfree(evtchnfd);
}
- mutex_unlock(&kvm->lock);
+ kfree(all_evtchnfds);
return 0;
}
@@ -1783,20 +2019,22 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
{
struct evtchnfd *evtchnfd;
struct evtchn_send send;
- gpa_t gpa;
- int idx;
-
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ struct x86_exception e;
- if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+ /* Sanity check: this structure is the same for 32-bit and 64-bit */
+ BUILD_BUG_ON(sizeof(send) != 4);
+ if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
*r = -EFAULT;
return true;
}
- /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+ /*
+ * evtchnfd is protected by kvm->srcu; the idr lookup instead
+ * is protected by RCU.
+ */
+ rcu_read_lock();
evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ rcu_read_unlock();
if (!evtchnfd)
return false;
@@ -1819,9 +2057,14 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
- kvm_gpc_init(&vcpu->arch.xen.runstate_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
+ KVM_HOST_USES_PFN);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -1829,17 +2072,42 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
if (kvm_xen_timer_enabled(vcpu))
kvm_xen_stop_timer(vcpu);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.runstate_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
- kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+ u32 function;
+
+ if (!vcpu->arch.xen.cpuid.base)
+ return;
+
+ function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
+ if (function > vcpu->arch.xen.cpuid.limit)
+ return;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
+ entry->edx = vcpu->arch.hv_clock.tsc_shift;
+ }
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
+ if (entry)
+ entry->eax = vcpu->arch.hw_tsc_khz;
+}
+
void kvm_xen_init_vm(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
@@ -1847,7 +2115,7 @@ void kvm_xen_destroy_vm(struct kvm *kvm)
struct evtchnfd *evtchnfd;
int i;
- kvm_gpc_deactivate(kvm, &kvm->arch.xen.shinfo_cache);
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
if (!evtchnfd->deliver.port.port)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 532a535a9e99..f8f1fe22d090 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -9,6 +9,8 @@
#ifndef __ARCH_X86_KVM_XEN_H__
#define __ARCH_X86_KVM_XEN_H__
+#include <asm/xen/hypervisor.h>
+
#ifdef CONFIG_KVM_XEN
#include <linux/jump_label_ratelimit.h>
@@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
@@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
{
return false;
}
+
+static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
@@ -143,11 +150,11 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
#include <asm/xen/interface.h>
#include <xen/interface/vcpu.h>
-void kvm_xen_update_runstate_guest(struct kvm_vcpu *vcpu, int state);
+void kvm_xen_update_runstate(struct kvm_vcpu *vcpu, int state);
static inline void kvm_xen_runstate_set_running(struct kvm_vcpu *vcpu)
{
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_running);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_running);
}
static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
@@ -162,7 +169,7 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
if (WARN_ON_ONCE(!vcpu->preempted))
return;
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ kvm_xen_update_runstate(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */
@@ -207,4 +214,11 @@ struct compat_vcpu_runstate_info {
uint64_t time[4];
} __attribute__((packed));
+struct compat_sched_poll {
+ /* This is actually a guest virtual address which points to ports. */
+ uint32_t ports;
+ unsigned int nr_ports;
+ uint64_t timeout;
+};
+
#endif /* __ARCH_X86_KVM_XEN_H__ */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7ba5f61d7273..01932af64193 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
lib-y += string_32.o
+ lib-y += memmove_32.o
ifneq ($(CONFIG_X86_CMPXCHG64),y)
lib-y += cmpxchg8b_emu.o atomic64_386_32.o
endif
@@ -70,6 +71,6 @@ ifneq ($(CONFIG_GENERIC_CSUM),y)
endif
lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
- lib-y += copy_user_64.o
+ lib-y += copy_user_64.o copy_user_uncached_64.o
lib-y += cmpxchg16b_emu.o
endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index ecbfb4dd3b01..f74a3e704a1c 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -57,134 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* Input:
* rdi destination
* rcx count
+ * rax is zero
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*/
-SYM_FUNC_START(clear_user_original)
- /*
- * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
- * i.e., no need for a 'q' suffix and thus a REX prefix.
- */
- mov %ecx,%eax
- shr $3,%rcx
- jz .Lrest_bytes
+SYM_FUNC_START(rep_stos_alternative)
+ cmpq $64,%rcx
+ jae .Lunrolled
- # do the qwords first
- .p2align 4
-.Lqwords:
- movq $0,(%rdi)
- lea 8(%rdi),%rdi
- dec %rcx
- jnz .Lqwords
+ cmp $8,%ecx
+ jae .Lword
-.Lrest_bytes:
- and $7, %eax
- jz .Lexit
+ testl %ecx,%ecx
+ je .Lexit
- # now do the rest bytes
-.Lbytes:
- movb $0,(%rdi)
+.Lclear_user_tail:
+0: movb %al,(%rdi)
inc %rdi
- dec %eax
- jnz .Lbytes
-
+ dec %rcx
+ jnz .Lclear_user_tail
.Lexit:
- /*
- * %rax still needs to be cleared in the exception case because this function is called
- * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
- * in case it might reuse it somewhere.
- */
- xor %eax,%eax
- RET
-
-.Lqwords_exception:
- # convert remaining qwords back into bytes to return to caller
- shl $3, %rcx
- and $7, %eax
- add %rax,%rcx
- jmp .Lexit
-
-.Lbytes_exception:
- mov %eax,%ecx
- jmp .Lexit
-
- _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
- _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
-
-/*
- * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
- * present.
- * Input:
- * rdi destination
- * rcx count
- *
- * Output:
- * rcx: uncleared bytes or 0 if successful.
- */
-SYM_FUNC_START(clear_user_rep_good)
- # call the original thing for less than a cacheline
- cmp $64, %rcx
- jb clear_user_original
-
-.Lprep:
- # copy lower 32-bits for rest bytes
- mov %ecx, %edx
- shr $3, %rcx
- jz .Lrep_good_rest_bytes
-
-.Lrep_good_qwords:
- rep stosq
-
-.Lrep_good_rest_bytes:
- and $7, %edx
- jz .Lrep_good_exit
-
-.Lrep_good_bytes:
- mov %edx, %ecx
- rep stosb
-
-.Lrep_good_exit:
- # see .Lexit comment above
- xor %eax, %eax
RET
-.Lrep_good_qwords_exception:
- # convert remaining qwords back into bytes to return to caller
- shl $3, %rcx
- and $7, %edx
- add %rdx, %rcx
- jmp .Lrep_good_exit
+ _ASM_EXTABLE_UA( 0b, .Lexit)
- _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
- _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
-SYM_FUNC_END(clear_user_rep_good)
-EXPORT_SYMBOL(clear_user_rep_good)
+.Lword:
+1: movq %rax,(%rdi)
+ addq $8,%rdi
+ sub $8,%ecx
+ je .Lexit
+ cmp $8,%ecx
+ jae .Lword
+ jmp .Lclear_user_tail
-/*
- * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
- * Input:
- * rdi destination
- * rcx count
- *
- * Output:
- * rcx: uncleared bytes or 0 if successful.
- *
- */
-SYM_FUNC_START(clear_user_erms)
- # call the original thing for less than a cacheline
- cmp $64, %rcx
- jb clear_user_original
-
-.Lerms_bytes:
- rep stosb
-
-.Lerms_exit:
- xorl %eax,%eax
+ .p2align 4
+.Lunrolled:
+10: movq %rax,(%rdi)
+11: movq %rax,8(%rdi)
+12: movq %rax,16(%rdi)
+13: movq %rax,24(%rdi)
+14: movq %rax,32(%rdi)
+15: movq %rax,40(%rdi)
+16: movq %rax,48(%rdi)
+17: movq %rax,56(%rdi)
+ addq $64,%rdi
+ subq $64,%rcx
+ cmpq $64,%rcx
+ jae .Lunrolled
+ cmpl $8,%ecx
+ jae .Lword
+ testl %ecx,%ecx
+ jne .Lclear_user_tail
RET
- _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
-SYM_FUNC_END(clear_user_erms)
-EXPORT_SYMBOL(clear_user_erms)
+ /*
+ * If we take an exception on any of the
+ * word stores, we know that %rcx isn't zero,
+ * so we can just go to the tail clearing to
+ * get the exact count.
+ *
+ * The unrolled case might end up clearing
+ * some bytes twice. Don't care.
+ *
+ * We could use the value in %rdi to avoid
+ * a second fault on the exact count case,
+ * but do we really care? No.
+ *
+ * Finally, we could try to align %rdi at the
+ * top of the unrolling. But unaligned stores
+ * just aren't that common or expensive.
+ */
+ _ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+ _ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index b6da09339308..80570eb3c89b 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -7,16 +7,18 @@
#include <linux/string.h>
#include <linux/ctype.h>
#include <asm/setup.h>
+#include <asm/cmdline.h>
static inline int myisspace(u8 c)
{
return c <= ' '; /* Close enough approximation */
}
-/**
+/*
* Find a boolean option (like quiet,noapic,nosmp....)
*
* @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
* @option: option string to look for
*
* Returns the position of that @option (starts counting with 1)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 9dec1b38a98f..01c5de4c279b 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -7,404 +7,116 @@
*/
#include <linux/linkage.h>
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
#include <asm/asm.h>
-#include <asm/smap.h>
#include <asm/export.h>
-#include <asm/trapnr.h>
-
-.macro ALIGN_DESTINATION
- /* check for bad alignment of destination */
- movl %edi,%ecx
- andl $7,%ecx
- jz 102f /* already aligned */
- subl $8,%ecx
- negl %ecx
- subl %ecx,%edx
-100: movb (%rsi),%al
-101: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 100b
-102:
-
- _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
- _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm
/*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro
- * code for rep movsq
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_generic_unrolled)
- ASM_STAC
- cmpl $8,%edx
- jb .Lcopy_user_short_string_bytes
- ALIGN_DESTINATION
- movl %edx,%ecx
- andl $63,%edx
- shrl $6,%ecx
- jz copy_user_short_string
-1: movq (%rsi),%r8
-2: movq 1*8(%rsi),%r9
-3: movq 2*8(%rsi),%r10
-4: movq 3*8(%rsi),%r11
-5: movq %r8,(%rdi)
-6: movq %r9,1*8(%rdi)
-7: movq %r10,2*8(%rdi)
-8: movq %r11,3*8(%rdi)
-9: movq 4*8(%rsi),%r8
-10: movq 5*8(%rsi),%r9
-11: movq 6*8(%rsi),%r10
-12: movq 7*8(%rsi),%r11
-13: movq %r8,4*8(%rdi)
-14: movq %r9,5*8(%rdi)
-15: movq %r10,6*8(%rdi)
-16: movq %r11,7*8(%rdi)
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- decl %ecx
- jnz 1b
- jmp copy_user_short_string
-
-30: shll $6,%ecx
- addl %ecx,%edx
- jmp .Lcopy_user_handle_tail
-
- _ASM_EXTABLE_CPY(1b, 30b)
- _ASM_EXTABLE_CPY(2b, 30b)
- _ASM_EXTABLE_CPY(3b, 30b)
- _ASM_EXTABLE_CPY(4b, 30b)
- _ASM_EXTABLE_CPY(5b, 30b)
- _ASM_EXTABLE_CPY(6b, 30b)
- _ASM_EXTABLE_CPY(7b, 30b)
- _ASM_EXTABLE_CPY(8b, 30b)
- _ASM_EXTABLE_CPY(9b, 30b)
- _ASM_EXTABLE_CPY(10b, 30b)
- _ASM_EXTABLE_CPY(11b, 30b)
- _ASM_EXTABLE_CPY(12b, 30b)
- _ASM_EXTABLE_CPY(13b, 30b)
- _ASM_EXTABLE_CPY(14b, 30b)
- _ASM_EXTABLE_CPY(15b, 30b)
- _ASM_EXTABLE_CPY(16b, 30b)
-SYM_FUNC_END(copy_user_generic_unrolled)
-EXPORT_SYMBOL(copy_user_generic_unrolled)
-
-/* Some CPUs run faster using the string copy instructions.
- * This is also a lot simpler. Use them when possible.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
+ * rep_movs_alternative - memory copy with exception handling.
+ * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
*
* Input:
* rdi destination
* rsi source
- * rdx count
+ * rcx count
*
* Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_generic_string)
- ASM_STAC
- cmpl $8,%edx
- jb 2f /* less than 8 bytes, go to byte copy loop */
- ALIGN_DESTINATION
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
-1: rep movsq
-2: movl %edx,%ecx
-3: rep movsb
- xorl %eax,%eax
- ASM_CLAC
- RET
-
-11: leal (%rdx,%rcx,8),%ecx
-12: movl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
-
- _ASM_EXTABLE_CPY(1b, 11b)
- _ASM_EXTABLE_CPY(3b, 12b)
-SYM_FUNC_END(copy_user_generic_string)
-EXPORT_SYMBOL(copy_user_generic_string)
-
-/*
- * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
- * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
+ * rcx uncopied bytes or 0 if successful.
*
- * Output:
- * eax uncopied bytes or 0 if successful.
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM. But to make
+ * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
*/
-SYM_FUNC_START(copy_user_enhanced_fast_string)
- ASM_STAC
- /* CPUs without FSRM should avoid rep movsb for short copies */
- ALTERNATIVE "cmpl $64, %edx; jb copy_user_short_string", "", X86_FEATURE_FSRM
- movl %edx,%ecx
-1: rep movsb
- xorl %eax,%eax
- ASM_CLAC
+SYM_FUNC_START(rep_movs_alternative)
+ cmpq $64,%rcx
+ jae .Llarge
+
+ cmp $8,%ecx
+ jae .Lword
+
+ testl %ecx,%ecx
+ je .Lexit
+
+.Lcopy_user_tail:
+0: movb (%rsi),%al
+1: movb %al,(%rdi)
+ inc %rdi
+ inc %rsi
+ dec %rcx
+ jne .Lcopy_user_tail
+.Lexit:
RET
-12: movl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
-
- _ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_enhanced_fast_string)
-EXPORT_SYMBOL(copy_user_enhanced_fast_string)
-
-/*
- * Try to copy last bytes and clear the rest if needed.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
- cmp $X86_TRAP_MC,%eax
- je 3f
-
- movl %edx,%ecx
-1: rep movsb
-2: mov %ecx,%eax
- ASM_CLAC
+ _ASM_EXTABLE_UA( 0b, .Lexit)
+ _ASM_EXTABLE_UA( 1b, .Lexit)
+
+ .p2align 4
+.Lword:
+2: movq (%rsi),%rax
+3: movq %rax,(%rdi)
+ addq $8,%rsi
+ addq $8,%rdi
+ sub $8,%ecx
+ je .Lexit
+ cmp $8,%ecx
+ jae .Lword
+ jmp .Lcopy_user_tail
+
+ _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+
+.Llarge:
+0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS
+1: RET
+
+ _ASM_EXTABLE_UA( 0b, 1b)
+
+ .p2align 4
+.Lunrolled:
+10: movq (%rsi),%r8
+11: movq 8(%rsi),%r9
+12: movq 16(%rsi),%r10
+13: movq 24(%rsi),%r11
+14: movq %r8,(%rdi)
+15: movq %r9,8(%rdi)
+16: movq %r10,16(%rdi)
+17: movq %r11,24(%rdi)
+20: movq 32(%rsi),%r8
+21: movq 40(%rsi),%r9
+22: movq 48(%rsi),%r10
+23: movq 56(%rsi),%r11
+24: movq %r8,32(%rdi)
+25: movq %r9,40(%rdi)
+26: movq %r10,48(%rdi)
+27: movq %r11,56(%rdi)
+ addq $64,%rsi
+ addq $64,%rdi
+ subq $64,%rcx
+ cmpq $64,%rcx
+ jae .Lunrolled
+ cmpl $8,%ecx
+ jae .Lword
+ testl %ecx,%ecx
+ jne .Lcopy_user_tail
RET
-3:
- movl %edx,%eax
- ASM_CLAC
- RET
-
- _ASM_EXTABLE_CPY(1b, 2b)
-
-.Lcopy_user_handle_align:
- addl %ecx,%edx /* ecx is zerorest also */
- jmp .Lcopy_user_handle_tail
-
-SYM_CODE_END(.Lcopy_user_handle_tail)
-
-/*
- * Finish memcpy of less than 64 bytes. #AC should already be set.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count (< 64)
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(copy_user_short_string)
- movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz .Lcopy_user_short_string_bytes
-18: movq (%rsi),%r8
-19: movq %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz 18b
-.Lcopy_user_short_string_bytes:
- andl %edx,%edx
- jz 23f
- movl %edx,%ecx
-21: movb (%rsi),%al
-22: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz 21b
-23: xor %eax,%eax
- ASM_CLAC
- RET
-
-40: leal (%rdx,%rcx,8),%edx
- jmp 60f
-50: movl %ecx,%edx /* ecx is zerorest also */
-60: jmp .Lcopy_user_handle_tail
-
- _ASM_EXTABLE_CPY(18b, 40b)
- _ASM_EXTABLE_CPY(19b, 40b)
- _ASM_EXTABLE_CPY(21b, 50b)
- _ASM_EXTABLE_CPY(22b, 50b)
-SYM_CODE_END(copy_user_short_string)
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination out of cache for more performance.
- *
- * Note: Cached memory copy is used when destination or size is not
- * naturally aligned. That is:
- * - Require 8-byte alignment when size is 8 bytes or larger.
- * - Require 4-byte alignment when size is 4 bytes.
- */
-SYM_FUNC_START(__copy_user_nocache)
- ASM_STAC
-
- /* If size is less than 8 bytes, go to 4-byte copy */
- cmpl $8,%edx
- jb .L_4b_nocache_copy_entry
-
- /* If destination is not 8-byte aligned, "cache" copy to align it */
- ALIGN_DESTINATION
-
- /* Set 4x8-byte copy count and remainder */
- movl %edx,%ecx
- andl $63,%edx
- shrl $6,%ecx
- jz .L_8b_nocache_copy_entry /* jump if count is 0 */
-
- /* Perform 4x8-byte nocache loop-copy */
-.L_4x8b_nocache_copy_loop:
-1: movq (%rsi),%r8
-2: movq 1*8(%rsi),%r9
-3: movq 2*8(%rsi),%r10
-4: movq 3*8(%rsi),%r11
-5: movnti %r8,(%rdi)
-6: movnti %r9,1*8(%rdi)
-7: movnti %r10,2*8(%rdi)
-8: movnti %r11,3*8(%rdi)
-9: movq 4*8(%rsi),%r8
-10: movq 5*8(%rsi),%r9
-11: movq 6*8(%rsi),%r10
-12: movq 7*8(%rsi),%r11
-13: movnti %r8,4*8(%rdi)
-14: movnti %r9,5*8(%rdi)
-15: movnti %r10,6*8(%rdi)
-16: movnti %r11,7*8(%rdi)
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- decl %ecx
- jnz .L_4x8b_nocache_copy_loop
-
- /* Set 8-byte copy count and remainder */
-.L_8b_nocache_copy_entry:
- movl %edx,%ecx
- andl $7,%edx
- shrl $3,%ecx
- jz .L_4b_nocache_copy_entry /* jump if count is 0 */
-
- /* Perform 8-byte nocache loop-copy */
-.L_8b_nocache_copy_loop:
-20: movq (%rsi),%r8
-21: movnti %r8,(%rdi)
- leaq 8(%rsi),%rsi
- leaq 8(%rdi),%rdi
- decl %ecx
- jnz .L_8b_nocache_copy_loop
-
- /* If no byte left, we're done */
-.L_4b_nocache_copy_entry:
- andl %edx,%edx
- jz .L_finish_copy
-
- /* If destination is not 4-byte aligned, go to byte copy: */
- movl %edi,%ecx
- andl $3,%ecx
- jnz .L_1b_cache_copy_entry
-
- /* Set 4-byte copy count (1 or 0) and remainder */
- movl %edx,%ecx
- andl $3,%edx
- shrl $2,%ecx
- jz .L_1b_cache_copy_entry /* jump if count is 0 */
-
- /* Perform 4-byte nocache copy: */
-30: movl (%rsi),%r8d
-31: movnti %r8d,(%rdi)
- leaq 4(%rsi),%rsi
- leaq 4(%rdi),%rdi
-
- /* If no bytes left, we're done: */
- andl %edx,%edx
- jz .L_finish_copy
-
- /* Perform byte "cache" loop-copy for the remainder */
-.L_1b_cache_copy_entry:
- movl %edx,%ecx
-.L_1b_cache_copy_loop:
-40: movb (%rsi),%al
-41: movb %al,(%rdi)
- incq %rsi
- incq %rdi
- decl %ecx
- jnz .L_1b_cache_copy_loop
-
- /* Finished copying; fence the prior stores */
-.L_finish_copy:
- xorl %eax,%eax
- ASM_CLAC
- sfence
- RET
-
-.L_fixup_4x8b_copy:
- shll $6,%ecx
- addl %ecx,%edx
- jmp .L_fixup_handle_tail
-.L_fixup_8b_copy:
- lea (%rdx,%rcx,8),%rdx
- jmp .L_fixup_handle_tail
-.L_fixup_4b_copy:
- lea (%rdx,%rcx,4),%rdx
- jmp .L_fixup_handle_tail
-.L_fixup_1b_copy:
- movl %ecx,%edx
-.L_fixup_handle_tail:
- sfence
- jmp .Lcopy_user_handle_tail
-
- _ASM_EXTABLE_CPY(1b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(2b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(3b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(4b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(5b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(6b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(7b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(8b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(9b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(10b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(11b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(12b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(13b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(14b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(15b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(16b, .L_fixup_4x8b_copy)
- _ASM_EXTABLE_CPY(20b, .L_fixup_8b_copy)
- _ASM_EXTABLE_CPY(21b, .L_fixup_8b_copy)
- _ASM_EXTABLE_CPY(30b, .L_fixup_4b_copy)
- _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
- _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
- _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
-SYM_FUNC_END(__copy_user_nocache)
-EXPORT_SYMBOL(__copy_user_nocache)
+ _ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+SYM_FUNC_END(rep_movs_alternative)
+EXPORT_SYMBOL(rep_movs_alternative)
diff --git a/arch/x86/lib/copy_user_uncached_64.S b/arch/x86/lib/copy_user_uncached_64.S
new file mode 100644
index 000000000000..5c5f38d32672
--- /dev/null
+++ b/arch/x86/lib/copy_user_uncached_64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm/export.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ *
+ * This copies from user space into kernel space, but the kernel
+ * space accesses can take a machine check exception, so they too
+ * need exception handling.
+ *
+ * Note: only 32-bit and 64-bit stores have non-temporal versions,
+ * and we only use aligned versions. Any unaligned parts at the
+ * start or end of the copy will be done using normal cached stores.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * edx count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__copy_user_nocache)
+ /* If destination is not 7-byte aligned, we'll have to align it */
+ testb $7,%dil
+ jne .Lalign
+
+.Lis_aligned:
+ cmp $64,%edx
+ jb .Lquadwords
+
+ .p2align 4,0x90
+.Lunrolled:
+10: movq (%rsi),%r8
+11: movq 8(%rsi),%r9
+12: movq 16(%rsi),%r10
+13: movq 24(%rsi),%r11
+20: movnti %r8,(%rdi)
+21: movnti %r9,8(%rdi)
+22: movnti %r10,16(%rdi)
+23: movnti %r11,24(%rdi)
+30: movq 32(%rsi),%r8
+31: movq 40(%rsi),%r9
+32: movq 48(%rsi),%r10
+33: movq 56(%rsi),%r11
+40: movnti %r8,32(%rdi)
+41: movnti %r9,40(%rdi)
+42: movnti %r10,48(%rdi)
+43: movnti %r11,56(%rdi)
+
+ addq $64,%rsi
+ addq $64,%rdi
+ sub $64,%edx
+ cmp $64,%edx
+ jae .Lunrolled
+
+/*
+ * First set of user mode loads have been done
+ * without any stores, so if they fail, we can
+ * just try the non-unrolled loop.
+ */
+_ASM_EXTABLE_UA(10b, .Lquadwords)
+_ASM_EXTABLE_UA(11b, .Lquadwords)
+_ASM_EXTABLE_UA(12b, .Lquadwords)
+_ASM_EXTABLE_UA(13b, .Lquadwords)
+
+/*
+ * The second set of user mode loads have been
+ * done with 32 bytes stored to the destination,
+ * so we need to take that into account before
+ * falling back to the unrolled loop.
+ */
+_ASM_EXTABLE_UA(30b, .Lfixup32)
+_ASM_EXTABLE_UA(31b, .Lfixup32)
+_ASM_EXTABLE_UA(32b, .Lfixup32)
+_ASM_EXTABLE_UA(33b, .Lfixup32)
+
+/*
+ * An exception on a write means that we're
+ * done, but we need to update the count
+ * depending on where in the unrolled loop
+ * we were.
+ */
+_ASM_EXTABLE_UA(20b, .Ldone0)
+_ASM_EXTABLE_UA(21b, .Ldone8)
+_ASM_EXTABLE_UA(22b, .Ldone16)
+_ASM_EXTABLE_UA(23b, .Ldone24)
+_ASM_EXTABLE_UA(40b, .Ldone32)
+_ASM_EXTABLE_UA(41b, .Ldone40)
+_ASM_EXTABLE_UA(42b, .Ldone48)
+_ASM_EXTABLE_UA(43b, .Ldone56)
+
+.Lquadwords:
+ cmp $8,%edx
+ jb .Llong
+50: movq (%rsi),%rax
+51: movnti %rax,(%rdi)
+ addq $8,%rsi
+ addq $8,%rdi
+ sub $8,%edx
+ jmp .Lquadwords
+
+/*
+ * If we fail on the last full quadword, we will
+ * not try to do any byte-wise cached accesses.
+ * We will try to do one more 4-byte uncached
+ * one, though.
+ */
+_ASM_EXTABLE_UA(50b, .Llast4)
+_ASM_EXTABLE_UA(51b, .Ldone0)
+
+.Llong:
+ test $4,%dl
+ je .Lword
+60: movl (%rsi),%eax
+61: movnti %eax,(%rdi)
+ addq $4,%rsi
+ addq $4,%rdi
+ sub $4,%edx
+.Lword:
+ sfence
+ test $2,%dl
+ je .Lbyte
+70: movw (%rsi),%ax
+71: movw %ax,(%rdi)
+ addq $2,%rsi
+ addq $2,%rdi
+ sub $2,%edx
+.Lbyte:
+ test $1,%dl
+ je .Ldone
+80: movb (%rsi),%al
+81: movb %al,(%rdi)
+ dec %edx
+.Ldone:
+ mov %edx,%eax
+ RET
+
+/*
+ * If we fail on the last four bytes, we won't
+ * bother with any fixups. It's dead, Jim. Note
+ * that there's no need for 'sfence' for any
+ * of this, since the exception will have been
+ * serializing.
+ */
+_ASM_EXTABLE_UA(60b, .Ldone)
+_ASM_EXTABLE_UA(61b, .Ldone)
+_ASM_EXTABLE_UA(70b, .Ldone)
+_ASM_EXTABLE_UA(71b, .Ldone)
+_ASM_EXTABLE_UA(80b, .Ldone)
+_ASM_EXTABLE_UA(81b, .Ldone)
+
+/*
+ * This is the "head needs aliging" case when
+ * the destination isn't 8-byte aligned. The
+ * 4-byte case can be done uncached, but any
+ * smaller alignment is done with regular stores.
+ */
+.Lalign:
+ test $1,%dil
+ je .Lalign_word
+ test %edx,%edx
+ je .Ldone
+90: movb (%rsi),%al
+91: movb %al,(%rdi)
+ inc %rsi
+ inc %rdi
+ dec %edx
+.Lalign_word:
+ test $2,%dil
+ je .Lalign_long
+ cmp $2,%edx
+ jb .Lbyte
+92: movw (%rsi),%ax
+93: movw %ax,(%rdi)
+ addq $2,%rsi
+ addq $2,%rdi
+ sub $2,%edx
+.Lalign_long:
+ test $4,%dil
+ je .Lis_aligned
+ cmp $4,%edx
+ jb .Lword
+94: movl (%rsi),%eax
+95: movnti %eax,(%rdi)
+ addq $4,%rsi
+ addq $4,%rdi
+ sub $4,%edx
+ jmp .Lis_aligned
+
+/*
+ * If we fail on the initial alignment accesses,
+ * we're all done. Again, no point in trying to
+ * do byte-by-byte probing if the 4-byte load
+ * fails - we're not doing any uncached accesses
+ * any more.
+ */
+_ASM_EXTABLE_UA(90b, .Ldone)
+_ASM_EXTABLE_UA(91b, .Ldone)
+_ASM_EXTABLE_UA(92b, .Ldone)
+_ASM_EXTABLE_UA(93b, .Ldone)
+_ASM_EXTABLE_UA(94b, .Ldone)
+_ASM_EXTABLE_UA(95b, .Ldone)
+
+/*
+ * Exception table fixups for faults in the middle
+ */
+.Ldone56: sub $8,%edx
+.Ldone48: sub $8,%edx
+.Ldone40: sub $8,%edx
+.Ldone32: sub $8,%edx
+.Ldone24: sub $8,%edx
+.Ldone16: sub $8,%edx
+.Ldone8: sub $8,%edx
+.Ldone0:
+ mov %edx,%eax
+ RET
+
+.Lfixup32:
+ addq $32,%rsi
+ addq $32,%rdi
+ sub $32,%edx
+ jmp .Lquadwords
+
+.Llast4:
+52: movl (%rsi),%eax
+53: movnti %eax,(%rdi)
+ sfence
+ sub $4,%edx
+ mov %edx,%eax
+ RET
+_ASM_EXTABLE_UA(52b, .Ldone0)
+_ASM_EXTABLE_UA(53b, .Ldone0)
+
+SYM_FUNC_END(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index 1e3de0769b81..b5a6d83106bc 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -11,6 +11,7 @@ asm(
".text\n"
".type just_return_func, @function\n"
".globl just_return_func\n"
+ ASM_FUNC_ALIGN
"just_return_func:\n"
ANNOTATE_NOENDBR
ASM_RET
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index b70d98d79a9d..b64a2bd1a1ef 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -37,22 +37,22 @@
#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
-#endif
+.macro check_range size:req
+.if IS_ENABLED(CONFIG_X86_64)
+ mov %rax, %rdx
+ sar $63, %rdx
+ or %rdx, %rax
+.else
+ cmp $TASK_SIZE_MAX-\size+1, %eax
+ jae .Lbad_get_user
+ sbb %edx, %edx /* array_index_mask_nospec() */
+ and %edx, %eax
+.endif
+.endm
.text
SYM_FUNC_START(__get_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=1
ASM_STAC
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
@@ -62,11 +62,7 @@ SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
SYM_FUNC_START(__get_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=2
ASM_STAC
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
@@ -76,11 +72,7 @@ SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
SYM_FUNC_START(__get_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=4
ASM_STAC
3: movl (%_ASM_AX),%edx
xor %eax,%eax
@@ -90,30 +82,17 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
-#ifdef CONFIG_X86_64
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=8
ASM_STAC
+#ifdef CONFIG_X86_64
4: movq (%_ASM_AX),%rdx
- xor %eax,%eax
- ASM_CLAC
- RET
#else
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user_8
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
- ASM_STAC
4: movl (%_ASM_AX),%edx
5: movl 4(%_ASM_AX),%ecx
+#endif
xor %eax,%eax
ASM_CLAC
RET
-#endif
SYM_FUNC_END(__get_user_8)
EXPORT_SYMBOL(__get_user_8)
@@ -166,7 +145,7 @@ EXPORT_SYMBOL(__get_user_nocheck_8)
SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
ASM_CLAC
-bad_get_user:
+.Lbad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
RET
@@ -184,23 +163,23 @@ SYM_CODE_END(.Lbad_get_user_8_clac)
#endif
/* get_user */
- _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(1b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(2b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(3b, .Lbad_get_user_clac)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(4b, .Lbad_get_user_clac)
#else
- _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac)
- _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(4b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(5b, .Lbad_get_user_8_clac)
#endif
/* __get_user */
- _ASM_EXTABLE_UA(6b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(7b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(8b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(6b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(7b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(8b, .Lbad_get_user_clac)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(9b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(9b, .Lbad_get_user_clac)
#else
- _ASM_EXTABLE_UA(9b, .Lbad_get_user_8_clac)
- _ASM_EXTABLE_UA(10b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(9b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(10b, .Lbad_get_user_8_clac)
#endif
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 21104c41cba0..558a605929db 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -1595,16 +1595,16 @@ bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
* Returns:
*
* Type of the instruction. Size of the memory operand is stored in
- * @bytes. If decode failed, MMIO_DECODE_FAILED returned.
+ * @bytes. If decode failed, INSN_MMIO_DECODE_FAILED returned.
*/
-enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
+enum insn_mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
{
- enum mmio_type type = MMIO_DECODE_FAILED;
+ enum insn_mmio_type type = INSN_MMIO_DECODE_FAILED;
*bytes = 0;
if (insn_get_opcode(insn))
- return MMIO_DECODE_FAILED;
+ return INSN_MMIO_DECODE_FAILED;
switch (insn->opcode.bytes[0]) {
case 0x88: /* MOV m8,r8 */
@@ -1613,7 +1613,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0x89: /* MOV m16/m32/m64, r16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_WRITE;
+ type = INSN_MMIO_WRITE;
break;
case 0xc6: /* MOV m8, imm8 */
@@ -1622,7 +1622,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xc7: /* MOV m16/m32/m64, imm16/imm32/imm64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_WRITE_IMM;
+ type = INSN_MMIO_WRITE_IMM;
break;
case 0x8a: /* MOV r8, m8 */
@@ -1631,7 +1631,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0x8b: /* MOV r16/r32/r64, m16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_READ;
+ type = INSN_MMIO_READ;
break;
case 0xa4: /* MOVS m8, m8 */
@@ -1640,7 +1640,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xa5: /* MOVS m16/m32/m64, m16/m32/m64 */
if (!*bytes)
*bytes = insn->opnd_bytes;
- type = MMIO_MOVS;
+ type = INSN_MMIO_MOVS;
break;
case 0x0f: /* Two-byte instruction */
@@ -1651,7 +1651,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xb7: /* MOVZX r32/r64, m16 */
if (!*bytes)
*bytes = 2;
- type = MMIO_READ_ZERO_EXTEND;
+ type = INSN_MMIO_READ_ZERO_EXTEND;
break;
case 0xbe: /* MOVSX r16/r32/r64, m8 */
@@ -1660,7 +1660,7 @@ enum mmio_type insn_decode_mmio(struct insn *insn, int *bytes)
case 0xbf: /* MOVSX r32/r64, m16 */
if (!*bytes)
*bytes = 2;
- type = MMIO_READ_SIGN_EXTEND;
+ type = INSN_MMIO_READ_SIGN_EXTEND;
break;
}
break;
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index a1f9416bf67a..6ff2f56cb0f7 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -10,6 +10,6 @@
*/
SYM_FUNC_START(__iowrite32_copy)
movl %edx,%ecx
- rep movsd
+ rep movsl
RET
SYM_FUNC_END(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index ef3af7ff2c8a..a29b64befb93 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count)
return __memset(s, c, count);
}
EXPORT_SYMBOL(memset);
-
-__visible void *memmove(void *dest, const void *src, size_t n)
-{
- int d0,d1,d2,d3,d4,d5;
- char *ret = dest;
-
- __asm__ __volatile__(
- /* Handle more 16 bytes in loop */
- "cmp $0x10, %0\n\t"
- "jb 1f\n\t"
-
- /* Decide forward/backward copy mode */
- "cmp %2, %1\n\t"
- "jb 2f\n\t"
-
- /*
- * movs instruction have many startup latency
- * so we handle small size by general register.
- */
- "cmp $680, %0\n\t"
- "jb 3f\n\t"
- /*
- * movs instruction is only good for aligned case.
- */
- "mov %1, %3\n\t"
- "xor %2, %3\n\t"
- "and $0xff, %3\n\t"
- "jz 4f\n\t"
- "3:\n\t"
- "sub $0x10, %0\n\t"
-
- /*
- * We gobble 16 bytes forward in each loop.
- */
- "3:\n\t"
- "sub $0x10, %0\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov 1*4(%1), %4\n\t"
- "mov %3, 0*4(%2)\n\t"
- "mov %4, 1*4(%2)\n\t"
- "mov 2*4(%1), %3\n\t"
- "mov 3*4(%1), %4\n\t"
- "mov %3, 2*4(%2)\n\t"
- "mov %4, 3*4(%2)\n\t"
- "lea 0x10(%1), %1\n\t"
- "lea 0x10(%2), %2\n\t"
- "jae 3b\n\t"
- "add $0x10, %0\n\t"
- "jmp 1f\n\t"
-
- /*
- * Handle data forward by movs.
- */
- ".p2align 4\n\t"
- "4:\n\t"
- "mov -4(%1, %0), %3\n\t"
- "lea -4(%2, %0), %4\n\t"
- "shr $2, %0\n\t"
- "rep movsl\n\t"
- "mov %3, (%4)\n\t"
- "jmp 11f\n\t"
- /*
- * Handle data backward by movs.
- */
- ".p2align 4\n\t"
- "6:\n\t"
- "mov (%1), %3\n\t"
- "mov %2, %4\n\t"
- "lea -4(%1, %0), %1\n\t"
- "lea -4(%2, %0), %2\n\t"
- "shr $2, %0\n\t"
- "std\n\t"
- "rep movsl\n\t"
- "mov %3,(%4)\n\t"
- "cld\n\t"
- "jmp 11f\n\t"
-
- /*
- * Start to prepare for backward copy.
- */
- ".p2align 4\n\t"
- "2:\n\t"
- "cmp $680, %0\n\t"
- "jb 5f\n\t"
- "mov %1, %3\n\t"
- "xor %2, %3\n\t"
- "and $0xff, %3\n\t"
- "jz 6b\n\t"
-
- /*
- * Calculate copy position to tail.
- */
- "5:\n\t"
- "add %0, %1\n\t"
- "add %0, %2\n\t"
- "sub $0x10, %0\n\t"
-
- /*
- * We gobble 16 bytes backward in each loop.
- */
- "7:\n\t"
- "sub $0x10, %0\n\t"
-
- "mov -1*4(%1), %3\n\t"
- "mov -2*4(%1), %4\n\t"
- "mov %3, -1*4(%2)\n\t"
- "mov %4, -2*4(%2)\n\t"
- "mov -3*4(%1), %3\n\t"
- "mov -4*4(%1), %4\n\t"
- "mov %3, -3*4(%2)\n\t"
- "mov %4, -4*4(%2)\n\t"
- "lea -0x10(%1), %1\n\t"
- "lea -0x10(%2), %2\n\t"
- "jae 7b\n\t"
- /*
- * Calculate copy position to head.
- */
- "add $0x10, %0\n\t"
- "sub %0, %1\n\t"
- "sub %0, %2\n\t"
-
- /*
- * Move data from 8 bytes to 15 bytes.
- */
- ".p2align 4\n\t"
- "1:\n\t"
- "cmp $8, %0\n\t"
- "jb 8f\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov 1*4(%1), %4\n\t"
- "mov -2*4(%1, %0), %5\n\t"
- "mov -1*4(%1, %0), %1\n\t"
-
- "mov %3, 0*4(%2)\n\t"
- "mov %4, 1*4(%2)\n\t"
- "mov %5, -2*4(%2, %0)\n\t"
- "mov %1, -1*4(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data from 4 bytes to 7 bytes.
- */
- ".p2align 4\n\t"
- "8:\n\t"
- "cmp $4, %0\n\t"
- "jb 9f\n\t"
- "mov 0*4(%1), %3\n\t"
- "mov -1*4(%1, %0), %4\n\t"
- "mov %3, 0*4(%2)\n\t"
- "mov %4, -1*4(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data from 2 bytes to 3 bytes.
- */
- ".p2align 4\n\t"
- "9:\n\t"
- "cmp $2, %0\n\t"
- "jb 10f\n\t"
- "movw 0*2(%1), %%dx\n\t"
- "movw -1*2(%1, %0), %%bx\n\t"
- "movw %%dx, 0*2(%2)\n\t"
- "movw %%bx, -1*2(%2, %0)\n\t"
- "jmp 11f\n\t"
-
- /*
- * Move data for 1 byte.
- */
- ".p2align 4\n\t"
- "10:\n\t"
- "cmp $1, %0\n\t"
- "jb 11f\n\t"
- "movb (%1), %%cl\n\t"
- "movb %%cl, (%2)\n\t"
- ".p2align 4\n\t"
- "11:"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2),
- "=r" (d3),"=r" (d4), "=r"(d5)
- :"0" (n),
- "1" (src),
- "2" (dest)
- :"memory");
-
- return ret;
-
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index dd8cd8831251..8f95fb267caa 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -8,14 +8,7 @@
#include <asm/alternative.h>
#include <asm/export.h>
-.pushsection .noinstr.text, "ax"
-
-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
+.section .noinstr.text, "ax"
/*
* memcpy - Copy a memory block.
@@ -27,36 +20,29 @@
*
* Output:
* rax original destination
+ *
+ * The FSRM alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep movsb' itself is small enough to replace the call, but the
+ * two register moves blow up the code. And one of them is "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
*/
SYM_TYPED_FUNC_START(__memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
+ ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
movq %rdi, %rax
movq %rdx, %rcx
- shrq $3, %rcx
- andl $7, %edx
- rep movsq
- movl %edx, %ecx
rep movsb
RET
SYM_FUNC_END(__memcpy)
EXPORT_SYMBOL(__memcpy)
-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
+SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-SYM_FUNC_START_LOCAL(memcpy_erms)
- movq %rdi, %rax
- movq %rdx, %rcx
- rep movsb
- RET
-SYM_FUNC_END(memcpy_erms)
-
SYM_FUNC_START_LOCAL(memcpy_orig)
movq %rdi, %rax
@@ -184,4 +170,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig)
RET
SYM_FUNC_END(memcpy_orig)
-.popsection
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S
new file mode 100644
index 000000000000..0588b2c0fc95
--- /dev/null
+++ b/arch/x86/lib/memmove_32.S
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+SYM_FUNC_START(memmove)
+/*
+ * void *memmove(void *dest_in, const void *src_in, size_t n)
+ * -mregparm=3 passes these in registers:
+ * dest_in: %eax
+ * src_in: %edx
+ * n: %ecx
+ * See also: arch/x86/entry/calling.h for description of the calling convention.
+ *
+ * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
+ * in %esi.
+ */
+.set dest_in, %eax
+.set dest, %edi
+.set src_in, %edx
+.set src, %esi
+.set n, %ecx
+.set tmp0, %edx
+.set tmp0w, %dx
+.set tmp1, %ebx
+.set tmp1w, %bx
+.set tmp2, %eax
+.set tmp3b, %cl
+
+/*
+ * Save all callee-saved registers, because this function is going to clobber
+ * all of them:
+ */
+ pushl %ebp
+ movl %esp, %ebp // set standard frame pointer
+
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+ pushl %eax // save 'dest_in' parameter [eax] as the return value
+
+ movl src_in, src
+ movl dest_in, dest
+
+ /* Handle more 16 bytes in loop */
+ cmpl $0x10, n
+ jb .Lmove_16B
+
+ /* Decide forward/backward copy mode */
+ cmpl dest, src
+ jb .Lbackwards_header
+
+ /*
+ * movs instruction have many startup latency
+ * so we handle small size by general register.
+ */
+ cmpl $680, n
+ jb .Ltoo_small_forwards
+ /* movs instruction is only good for aligned case. */
+ movl src, tmp0
+ xorl dest, tmp0
+ andl $0xff, tmp0
+ jz .Lforward_movs
+.Ltoo_small_forwards:
+ subl $0x10, n
+
+ /* We gobble 16 bytes forward in each loop. */
+.Lmove_16B_forwards_loop:
+ subl $0x10, n
+ movl 0*4(src), tmp0
+ movl 1*4(src), tmp1
+ movl tmp0, 0*4(dest)
+ movl tmp1, 1*4(dest)
+ movl 2*4(src), tmp0
+ movl 3*4(src), tmp1
+ movl tmp0, 2*4(dest)
+ movl tmp1, 3*4(dest)
+ leal 0x10(src), src
+ leal 0x10(dest), dest
+ jae .Lmove_16B_forwards_loop
+ addl $0x10, n
+ jmp .Lmove_16B
+
+ /* Handle data forward by movs. */
+.p2align 4
+.Lforward_movs:
+ movl -4(src, n), tmp0
+ leal -4(dest, n), tmp1
+ shrl $2, n
+ rep movsl
+ movl tmp0, (tmp1)
+ jmp .Ldone
+
+ /* Handle data backward by movs. */
+.p2align 4
+.Lbackwards_movs:
+ movl (src), tmp0
+ movl dest, tmp1
+ leal -4(src, n), src
+ leal -4(dest, n), dest
+ shrl $2, n
+ std
+ rep movsl
+ movl tmp0,(tmp1)
+ cld
+ jmp .Ldone
+
+ /* Start to prepare for backward copy. */
+.p2align 4
+.Lbackwards_header:
+ cmpl $680, n
+ jb .Ltoo_small_backwards
+ movl src, tmp0
+ xorl dest, tmp0
+ andl $0xff, tmp0
+ jz .Lbackwards_movs
+
+ /* Calculate copy position to tail. */
+.Ltoo_small_backwards:
+ addl n, src
+ addl n, dest
+ subl $0x10, n
+
+ /* We gobble 16 bytes backward in each loop. */
+.Lmove_16B_backwards_loop:
+ subl $0x10, n
+
+ movl -1*4(src), tmp0
+ movl -2*4(src), tmp1
+ movl tmp0, -1*4(dest)
+ movl tmp1, -2*4(dest)
+ movl -3*4(src), tmp0
+ movl -4*4(src), tmp1
+ movl tmp0, -3*4(dest)
+ movl tmp1, -4*4(dest)
+ leal -0x10(src), src
+ leal -0x10(dest), dest
+ jae .Lmove_16B_backwards_loop
+ /* Calculate copy position to head. */
+ addl $0x10, n
+ subl n, src
+ subl n, dest
+
+ /* Move data from 8 bytes to 15 bytes. */
+.p2align 4
+.Lmove_16B:
+ cmpl $8, n
+ jb .Lmove_8B
+ movl 0*4(src), tmp0
+ movl 1*4(src), tmp1
+ movl -2*4(src, n), tmp2
+ movl -1*4(src, n), src
+
+ movl tmp0, 0*4(dest)
+ movl tmp1, 1*4(dest)
+ movl tmp2, -2*4(dest, n)
+ movl src, -1*4(dest, n)
+ jmp .Ldone
+
+ /* Move data from 4 bytes to 7 bytes. */
+.p2align 4
+.Lmove_8B:
+ cmpl $4, n
+ jb .Lmove_4B
+ movl 0*4(src), tmp0
+ movl -1*4(src, n), tmp1
+ movl tmp0, 0*4(dest)
+ movl tmp1, -1*4(dest, n)
+ jmp .Ldone
+
+ /* Move data from 2 bytes to 3 bytes. */
+.p2align 4
+.Lmove_4B:
+ cmpl $2, n
+ jb .Lmove_1B
+ movw 0*2(src), tmp0w
+ movw -1*2(src, n), tmp1w
+ movw tmp0w, 0*2(dest)
+ movw tmp1w, -1*2(dest, n)
+ jmp .Ldone
+
+ /* Move data for 1 byte. */
+.p2align 4
+.Lmove_1B:
+ cmpl $1, n
+ jb .Ldone
+ movb (src), tmp3b
+ movb tmp3b, (dest)
+.p2align 4
+.Ldone:
+ popl dest_in // restore 'dest_in' [eax] as the return value
+ /* Restore all callee-saved registers: */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ RET
+SYM_FUNC_END(memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 724bbf83eb5b..02661861e5dd 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -13,6 +13,8 @@
#undef memmove
+.section .noinstr.text, "ax"
+
/*
* Implement memmove(). This can handle overlap between src and dst.
*
@@ -213,5 +215,5 @@ SYM_FUNC_START(__memmove)
SYM_FUNC_END(__memmove)
EXPORT_SYMBOL(__memmove)
-SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
+SYM_FUNC_ALIAS(memmove, __memmove)
EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index fc9ffd3ff3b2..7c59a704c458 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,6 +6,8 @@
#include <asm/alternative.h>
#include <asm/export.h>
+.section .noinstr.text, "ax"
+
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -16,56 +18,31 @@
* rdx count (bytes)
*
* rax original destination
+ *
+ * The FSRS alternative should be done inline (avoiding the call and
+ * the disgusting return handling), but that would require some help
+ * from the compiler for better calling conventions.
+ *
+ * The 'rep stosb' itself is small enough to replace the call, but all
+ * the register moves blow up the code. And two of them are "needed"
+ * only for the return value that is the same as the source input,
+ * which the compiler could/should do much better anyway.
*/
SYM_FUNC_START(__memset)
- /*
- * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
- * to use it when possible. If not available, use fast string instructions.
- *
- * Otherwise, use original memset function.
- */
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memset_erms", X86_FEATURE_ERMS
+ ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
movq %rdi,%r9
+ movb %sil,%al
movq %rdx,%rcx
- andl $7,%edx
- shrq $3,%rcx
- /* expand byte value */
- movzbl %sil,%esi
- movabs $0x0101010101010101,%rax
- imulq %rsi,%rax
- rep stosq
- movl %edx,%ecx
rep stosb
movq %r9,%rax
RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
-SYM_FUNC_ALIAS_WEAK(memset, __memset)
+SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset)
-/*
- * ISO C memset - set a memory block to a byte value. This function uses
- * enhanced rep stosb to override the fast string function.
- * The code is simpler and shorter than the fast string function as well.
- *
- * rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
- * rax original destination
- */
-SYM_FUNC_START_LOCAL(memset_erms)
- movq %rdi,%r9
- movb %sil,%al
- movq %rdx,%rcx
- rep stosb
- movq %r9,%rax
- RET
-SYM_FUNC_END(memset_erms)
-
SYM_FUNC_START_LOCAL(memset_orig)
movq %rdi,%r10
diff --git a/arch/x86/lib/misc.c b/arch/x86/lib/misc.c
index a018ec4fba53..92cd8ecc3a2c 100644
--- a/arch/x86/lib/misc.c
+++ b/arch/x86/lib/misc.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+#include <asm/misc.h>
+
/*
* Count the digits of @val including a possible sign.
*
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index b7dfd60243b7..3062d09a776d 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -33,22 +33,20 @@
* as they get called from within inline assembly.
*/
-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_BX
-#endif
+.macro check_range size:req
+.if IS_ENABLED(CONFIG_X86_64)
+ mov %rcx, %rbx
+ sar $63, %rbx
+ or %rbx, %rcx
+.else
+ cmp $TASK_SIZE_MAX-\size+1, %ecx
+ jae .Lbad_put_user
+.endif
+.endm
.text
SYM_FUNC_START(__put_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
- ENDBR
+ check_range size=1
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -56,54 +54,81 @@ SYM_INNER_LABEL(__put_user_nocheck_1, SYM_L_GLOBAL)
RET
SYM_FUNC_END(__put_user_1)
EXPORT_SYMBOL(__put_user_1)
+
+SYM_FUNC_START(__put_user_nocheck_1)
+ ENDBR
+ ASM_STAC
+2: movb %al,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
SYM_FUNC_START(__put_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_2, SYM_L_GLOBAL)
- ENDBR
+ check_range size=2
ASM_STAC
-2: movw %ax,(%_ASM_CX)
+3: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_2)
EXPORT_SYMBOL(__put_user_2)
+
+SYM_FUNC_START(__put_user_nocheck_2)
+ ENDBR
+ ASM_STAC
+4: movw %ax,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
SYM_FUNC_START(__put_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_4, SYM_L_GLOBAL)
- ENDBR
+ check_range size=4
ASM_STAC
-3: movl %eax,(%_ASM_CX)
+5: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_4)
EXPORT_SYMBOL(__put_user_4)
+
+SYM_FUNC_START(__put_user_nocheck_4)
+ ENDBR
+ ASM_STAC
+6: movl %eax,(%_ASM_CX)
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
SYM_FUNC_START(__put_user_8)
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
-SYM_INNER_LABEL(__put_user_nocheck_8, SYM_L_GLOBAL)
- ENDBR
+ check_range size=8
ASM_STAC
-4: mov %_ASM_AX,(%_ASM_CX)
+7: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
-5: movl %edx,4(%_ASM_CX)
+8: movl %edx,4(%_ASM_CX)
#endif
xor %ecx,%ecx
ASM_CLAC
RET
SYM_FUNC_END(__put_user_8)
EXPORT_SYMBOL(__put_user_8)
+
+SYM_FUNC_START(__put_user_nocheck_8)
+ ENDBR
+ ASM_STAC
+9: mov %_ASM_AX,(%_ASM_CX)
+#ifdef CONFIG_X86_32
+10: movl %edx,4(%_ASM_CX)
+#endif
+ xor %ecx,%ecx
+ ASM_CLAC
+ RET
+SYM_FUNC_END(__put_user_nocheck_8)
EXPORT_SYMBOL(__put_user_nocheck_8)
SYM_CODE_START_LOCAL(.Lbad_put_user_clac)
@@ -113,10 +138,15 @@ SYM_CODE_START_LOCAL(.Lbad_put_user_clac)
RET
SYM_CODE_END(.Lbad_put_user_clac)
- _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(1b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(2b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(3b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(4b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(5b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(6b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(7b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(9b, .Lbad_put_user_clac)
#ifdef CONFIG_X86_32
- _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(8b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(10b, .Lbad_put_user_clac)
#endif
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 073289a55f84..b3b1e376dce8 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -5,24 +5,27 @@
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
+#include <asm/asm-offsets.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/percpu.h>
#include <asm/frame.h>
.section .text.__x86.indirect_thunk
-.macro RETPOLINE reg
+
+.macro POLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
call .Ldo_rop_\@
-.Lspec_trap_\@:
- UNWIND_HINT_EMPTY
- pause
- lfence
- jmp .Lspec_trap_\@
+ int3
.Ldo_rop_\@:
mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
+.endm
+
+.macro RETPOLINE reg
+ POLINE \reg
RET
.endm
@@ -30,7 +33,7 @@
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
@@ -52,7 +55,6 @@ SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
*/
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
-#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_START(__x86_indirect_thunk_array)
@@ -64,10 +66,65 @@ SYM_CODE_START(__x86_indirect_thunk_array)
.align RETPOLINE_THUNK_SIZE
SYM_CODE_END(__x86_indirect_thunk_array)
-#define GEN(reg) EXPORT_THUNK(reg)
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+.macro CALL_THUNK reg
+ .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+
+ CALL_DEPTH_ACCOUNT
+ POLINE \reg
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+.endm
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) CALL_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_call_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_call_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+
+.macro JUMP_THUNK reg
+ .align RETPOLINE_THUNK_SIZE
+
+SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
+ UNWIND_HINT_UNDEFINED
+ ANNOTATE_NOENDBR
+ POLINE \reg
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+.endm
+
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_START(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) JUMP_THUNK reg
#include <asm/GEN-for-each-reg.h>
#undef GEN
+ .align RETPOLINE_THUNK_SIZE
+SYM_CODE_END(__x86_indirect_jump_thunk_array)
+
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_jump_thunk_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+#undef GEN
+#endif
/*
* This function name is magical and is used by -mfunction-return=thunk-extern
* for the compiler to generate JMPs to it.
@@ -87,8 +144,8 @@ SYM_CODE_END(__x86_indirect_thunk_array)
*/
.align 64
.skip 63, 0xcc
-SYM_FUNC_START_NOALIGN(zen_untrain_ret);
-
+SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE)
+ ANNOTATE_NOENDBR
/*
* As executed from zen_untrain_ret, this is:
*
@@ -140,3 +197,37 @@ __EXPORT_THUNK(zen_untrain_ret)
EXPORT_SYMBOL(__x86_return_thunk)
#endif /* CONFIG_RETHUNK */
+
+#ifdef CONFIG_CALL_DEPTH_TRACKING
+
+ .align 64
+SYM_FUNC_START(__x86_return_skl)
+ ANNOTATE_NOENDBR
+ /*
+ * Keep the hotpath in a 16byte I-fetch for the non-debug
+ * case.
+ */
+ CALL_THUNKS_DEBUG_INC_RETS
+ shlq $5, PER_CPU_VAR(pcpu_hot + X86_call_depth)
+ jz 1f
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+1:
+ CALL_THUNKS_DEBUG_INC_STUFFS
+ .rept 16
+ ANNOTATE_INTRA_FUNCTION_CALL
+ call 2f
+ int3
+2:
+ .endr
+ add $(8*16), %rsp
+
+ CREDIT_CALL_DEPTH
+
+ ANNOTATE_UNRET_SAFE
+ ret
+ int3
+SYM_FUNC_END(__x86_return_skl)
+
+#endif /* CONFIG_CALL_DEPTH_TRACKING */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 6c1f8ac5e721..003d90138e20 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -45,7 +45,11 @@ EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);
long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
{
unsigned long flushed, dest = (unsigned long) dst;
- long rc = __copy_user_nocache(dst, src, size, 0);
+ long rc;
+
+ stac();
+ rc = __copy_user_nocache(dst, src, size);
+ clac();
/*
* __copy_user_nocache() uses non-temporal stores for the bulk
@@ -136,13 +140,4 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
}
}
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
-
-void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
- size_t len)
-{
- char *from = kmap_atomic(page);
-
- memcpy_flushcache(to, from + offset, len);
- kunmap_atomic(from);
-}
#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index d12d1358f96d..5168ee0360b2 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1047,6 +1047,7 @@ GrpTable: Grp6
3: LTR Ew
4: VERR Ew
5: VERW Ew
+6: LKGS Ew (F2)
EndTable
GrpTable: Grp7
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 6c2f1b76a0b6..e91500a80963 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -9,22 +9,67 @@
#include <asm/cpu_entry_area.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
+#include <asm/kasan.h>
+#include <asm/setup.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
-#endif
-#ifdef CONFIG_X86_32
+static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
+
+static __always_inline unsigned int cea_offset(unsigned int cpu)
+{
+ return per_cpu(_cea_offset, cpu);
+}
+
+static __init void init_cea_offsets(void)
+{
+ unsigned int max_cea;
+ unsigned int i, j;
+
+ if (!kaslr_enabled()) {
+ for_each_possible_cpu(i)
+ per_cpu(_cea_offset, i) = i;
+ return;
+ }
+
+ max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;
+
+ /* O(sodding terrible) */
+ for_each_possible_cpu(i) {
+ unsigned int cea;
+
+again:
+ cea = get_random_u32_below(max_cea);
+
+ for_each_possible_cpu(j) {
+ if (cea_offset(j) == cea)
+ goto again;
+
+ if (i == j)
+ break;
+ }
+
+ per_cpu(_cea_offset, i) = cea;
+ }
+}
+#else /* !X86_64 */
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
+
+static __always_inline unsigned int cea_offset(unsigned int cpu)
+{
+ return cpu;
+}
+static inline void init_cea_offsets(void) { }
#endif
/* Is called from entry code, so must be noinstr */
noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
- unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return (struct cpu_entry_area *) va;
@@ -138,20 +183,19 @@ static void __init setup_cpu_entry_area(unsigned int cpu)
pgprot_t tss_prot = PAGE_KERNEL_RO;
#else
/*
- * On native 32-bit systems, the GDT cannot be read-only because
+ * On 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the
* GDT is read-only, that will triple fault. The TSS cannot be
* read-only because the CPU writes to it on task switches.
- *
- * On Xen PV, the GDT must be read-only because the hypervisor
- * requires it.
*/
- pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
- PAGE_KERNEL_RO : PAGE_KERNEL;
+ pgprot_t gdt_prot = PAGE_KERNEL;
pgprot_t tss_prot = PAGE_KERNEL;
#endif
+ kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE,
+ early_cpu_to_node(cpu));
+
cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
cea_map_percpu_pages(&cea->entry_stack_page,
@@ -205,7 +249,6 @@ static __init void setup_cpu_entry_area_ptes(void)
/* The +1 is for the readonly IDT: */
BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
- BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE;
@@ -221,6 +264,8 @@ void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
+ init_cea_offsets();
+
setup_cpu_entry_area_ptes();
for_each_possible_cpu(cpu)
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 092ea436c7e6..b43301cb2a80 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -71,6 +71,5 @@ static void __exit pt_dump_debug_exit(void)
module_init(pt_dump_debug_init);
module_exit(pt_dump_debug_exit);
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 60814e110a54..271dcb2deabc 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -130,10 +130,36 @@ static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
return true;
}
+/*
+ * On x86-64, we end up being imprecise with 'access_ok()', and allow
+ * non-canonical user addresses to make the range comparisons simpler,
+ * and to not have to worry about LAM being enabled.
+ *
+ * In fact, we allow up to one page of "slop" at the sign boundary,
+ * which means that we can do access_ok() by just checking the sign
+ * of the pointer for the common case of having a small access size.
+ */
+static bool gp_fault_address_ok(unsigned long fault_address)
+{
+#ifdef CONFIG_X86_64
+ /* Is it in the "user space" part of the non-canonical space? */
+ if (valid_user_address(fault_address))
+ return true;
+
+ /* .. or just above it? */
+ fault_address -= PAGE_SIZE;
+ if (valid_user_address(fault_address))
+ return true;
+#endif
+ return false;
+}
+
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr)
+ struct pt_regs *regs, int trapnr,
+ unsigned long fault_address)
{
- WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
+ WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
+ "General protection fault in user access. Non-canonical address?");
return ex_handler_default(fixup, regs);
}
@@ -189,10 +215,12 @@ static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
}
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
- struct pt_regs *regs, int trapnr, int reg, int imm)
+ struct pt_regs *regs, int trapnr,
+ unsigned long fault_address,
+ int reg, int imm)
{
regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
- return ex_handler_uaccess(fixup, regs, trapnr);
+ return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}
int ex_get_fixup_type(unsigned long ip)
@@ -238,7 +266,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
case EX_TYPE_FAULT_MCE_SAFE:
return ex_handler_fault(e, regs, trapnr);
case EX_TYPE_UACCESS:
- return ex_handler_uaccess(e, regs, trapnr);
+ return ex_handler_uaccess(e, regs, trapnr, fault_addr);
case EX_TYPE_COPY:
return ex_handler_copy(e, regs, trapnr);
case EX_TYPE_CLEAR_FS:
@@ -269,7 +297,7 @@ int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
case EX_TYPE_FAULT_SGX:
return ex_handler_sgx(e, regs, trapnr);
case EX_TYPE_UCOPY_LEN:
- return ex_handler_ucopy_len(e, regs, trapnr, reg, imm);
+ return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
case EX_TYPE_ZEROPAD:
return ex_handler_zeropad(e, regs, fault_addr);
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7b0d4ab894c8..e4399983c50c 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/
#include <linux/mm_types.h>
+#include <linux/mm.h> /* find_and_lock_vma() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
@@ -260,7 +261,7 @@ static noinline int vmalloc_fault(unsigned long address)
}
NOKPROBE_SYMBOL(vmalloc_fault);
-static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
unsigned long addr;
@@ -284,27 +285,6 @@ static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end)
}
}
-void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
-{
- __arch_sync_kernel_mappings(start, end);
-#ifdef CONFIG_KMSAN
- /*
- * KMSAN maintains two additional metadata page mappings for the
- * [VMALLOC_START, VMALLOC_END) range. These mappings start at
- * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and
- * have to be synced together with the vmalloc memory mapping.
- */
- if (start >= VMALLOC_START && end < VMALLOC_END) {
- __arch_sync_kernel_mappings(
- start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START,
- end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START);
- __arch_sync_kernel_mappings(
- start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START,
- end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START);
- }
-#endif
-}
-
static bool low_pfn(unsigned long pfn)
{
return pfn < max_low_pfn;
@@ -1354,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs,
}
#endif
+#ifdef CONFIG_PER_VMA_LOCK
+ if (!(flags & FAULT_FLAG_USER))
+ goto lock_mmap;
+
+ vma = lock_vma_under_rcu(mm, address);
+ if (!vma)
+ goto lock_mmap;
+
+ if (unlikely(access_error(error_code, vma))) {
+ vma_end_read(vma);
+ goto lock_mmap;
+ }
+ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
+ vma_end_read(vma);
+
+ if (!(fault & VM_FAULT_RETRY)) {
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ goto done;
+ }
+ count_vm_vma_lock_event(VMA_LOCK_RETRY);
+
+ /* Quick path to respond to signals */
+ if (fault_signal_pending(fault, regs)) {
+ if (!user_mode(regs))
+ kernelmode_fixup_or_oops(regs, error_code, address,
+ SIGBUS, BUS_ADRERR,
+ ARCH_DEFAULT_PKEY);
+ return;
+ }
+lock_mmap:
+#endif /* CONFIG_PER_VMA_LOCK */
+
/*
* Kernel-mode access to the user address space should only occur
* on well-defined single instructions listed in the exception
@@ -1454,6 +1466,9 @@ good_area:
}
mmap_read_unlock(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+done:
+#endif
if (likely(!(fault & VM_FAULT_ERROR)))
return;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 9121bc1b9453..8192452d1d2d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <asm/set_memory.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/init.h>
#include <asm/page.h>
@@ -26,6 +27,7 @@
#include <asm/pti.h>
#include <asm/text-patching.h>
#include <asm/memtype.h>
+#include <asm/paravirt.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -260,6 +262,24 @@ static void __init probe_page_size_mask(void)
}
}
+#define INTEL_MATCH(_model) { .vendor = X86_VENDOR_INTEL, \
+ .family = 6, \
+ .model = _model, \
+ }
+/*
+ * INVLPG may not properly flush Global entries
+ * on these CPUs when PCIDs are enabled.
+ */
+static const struct x86_cpu_id invlpg_miss_ids[] = {
+ INTEL_MATCH(INTEL_FAM6_ALDERLAKE ),
+ INTEL_MATCH(INTEL_FAM6_ALDERLAKE_L ),
+ INTEL_MATCH(INTEL_FAM6_ALDERLAKE_N ),
+ INTEL_MATCH(INTEL_FAM6_RAPTORLAKE ),
+ INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_P),
+ INTEL_MATCH(INTEL_FAM6_RAPTORLAKE_S),
+ {}
+};
+
static void setup_pcid(void)
{
if (!IS_ENABLED(CONFIG_X86_64))
@@ -268,6 +288,12 @@ static void setup_pcid(void)
if (!boot_cpu_has(X86_FEATURE_PCID))
return;
+ if (x86_match_cpu(invlpg_miss_ids)) {
+ pr_info("Incomplete global flushes, disabling PCID");
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ return;
+ }
+
if (boot_cpu_has(X86_FEATURE_PGE)) {
/*
* This can't be cr4_set_bits_and_update_boot() -- the
@@ -801,9 +827,12 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
- poking_mm = copy_init_mm();
+ poking_mm = mm_alloc();
BUG_ON(!poking_mm);
+ /* Xen PV guests need the PGD to be pinned. */
+ paravirt_enter_mmap(poking_mm);
+
/*
* Randomize the poking address, but make sure that the following page
* will be mapped at the same PMD. We need 2 pages, so find space for 3,
@@ -1044,6 +1073,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
+#ifdef CONFIG_ADDRESS_MASKING
+DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+#endif
+
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3f040c6e5d13..a190aae8ceaf 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1416,47 +1416,6 @@ void mark_rodata_ro(void)
debug_checkwx();
}
-int kern_addr_valid(unsigned long addr)
-{
- unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (above != 0 && above != -1UL)
- return 0;
-
- pgd = pgd_offset_k(addr);
- if (pgd_none(*pgd))
- return 0;
-
- p4d = p4d_offset(pgd, addr);
- if (!p4d_present(*p4d))
- return 0;
-
- pud = pud_offset(p4d, addr);
- if (!pud_present(*pud))
- return 0;
-
- if (pud_large(*pud))
- return pfn_valid(pud_pfn(*pud));
-
- pmd = pmd_offset(pud, addr);
- if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return pfn_valid(pmd_pfn(*pmd));
-
- pte = pte_offset_kernel(pmd, addr);
- if (pte_none(*pte))
- return 0;
-
- return pfn_valid(pte_pfn(*pte));
-}
-
/*
* Block size is the minimum amount of memory which can be hotplugged or
* hotremoved. It must be power of two and must be equal or larger than
@@ -1533,72 +1492,44 @@ static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
-static int __meminit vmemmap_populate_hugepages(unsigned long start,
- unsigned long end, int node, struct vmem_altmap *altmap)
+void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
+ unsigned long addr, unsigned long next)
{
- unsigned long addr;
- unsigned long next;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
-
- for (addr = start; addr < end; addr = next) {
- next = pmd_addr_end(addr, end);
-
- pgd = vmemmap_pgd_populate(addr, node);
- if (!pgd)
- return -ENOMEM;
-
- p4d = vmemmap_p4d_populate(pgd, addr, node);
- if (!p4d)
- return -ENOMEM;
-
- pud = vmemmap_pud_populate(p4d, addr, node);
- if (!pud)
- return -ENOMEM;
-
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- void *p;
-
- p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
- if (p) {
- pte_t entry;
-
- entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
- PAGE_KERNEL_LARGE);
- set_pmd(pmd, __pmd(pte_val(entry)));
+ pte_t entry;
+
+ entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE);
+ set_pmd(pmd, __pmd(pte_val(entry)));
+
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
- /* check to see if we have contiguous blocks */
- if (p_end != p || node_start != node) {
- if (p_start)
- pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n",
- addr_start, addr_end-1, p_start, p_end-1, node_start);
- addr_start = addr;
- node_start = node;
- p_start = p;
- }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
- addr_end = addr + PMD_SIZE;
- p_end = p + PMD_SIZE;
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+}
- if (!IS_ALIGNED(addr, PMD_SIZE) ||
- !IS_ALIGNED(next, PMD_SIZE))
- vmemmap_use_new_sub_pmd(addr, next);
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+ unsigned long addr, unsigned long next)
+{
+ int large = pmd_large(*pmd);
- continue;
- } else if (altmap)
- return -ENOMEM; /* no fallback */
- } else if (pmd_large(*pmd)) {
- vmemmap_verify((pte_t *)pmd, node, addr, next);
- vmemmap_use_sub_pmd(addr, next);
- continue;
- }
- if (vmemmap_populate_basepages(addr, next, node, NULL))
- return -ENOMEM;
+ if (pmd_large(*pmd)) {
+ vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
}
- return 0;
+
+ return large;
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 78c5bc654cff..aa7d279321ea 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -116,6 +116,11 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
+ if (x86_platform.hyper.is_private_mmio(addr)) {
+ desc->flags |= IORES_MAP_ENCRYPTED;
+ return;
+ }
+
if (!IS_ENABLED(CONFIG_EFI))
return;
@@ -217,9 +222,15 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
- phys_addr &= PHYSICAL_PAGE_MASK;
+ phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
+ /*
+ * Mask out any bits not part of the actual physical
+ * address, like memory encryption bits.
+ */
+ phys_addr &= PHYSICAL_PAGE_MASK;
+
retval = memtype_reserve(phys_addr, (u64)phys_addr + size,
pcm, &new_pcm);
if (retval) {
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index e7b9b464a82f..0302491d799d 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -316,10 +316,33 @@ void __init kasan_early_init(void)
kasan_map_early_shadow(init_top_pgt);
}
+static unsigned long kasan_mem_to_shadow_align_down(unsigned long va)
+{
+ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
+
+ return round_down(shadow, PAGE_SIZE);
+}
+
+static unsigned long kasan_mem_to_shadow_align_up(unsigned long va)
+{
+ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
+
+ return round_up(shadow, PAGE_SIZE);
+}
+
+void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
+{
+ unsigned long shadow_start, shadow_end;
+
+ shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va);
+ shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size);
+ kasan_populate_shadow(shadow_start, shadow_end, nid);
+}
+
void __init kasan_init(void)
{
+ unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end;
int i;
- void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
@@ -360,16 +383,10 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
- shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
- shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
- shadow_cpu_entry_begin = (void *)round_down(
- (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
-
- shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
- CPU_ENTRY_AREA_MAP_SIZE);
- shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
- shadow_cpu_entry_end = (void *)round_up(
- (unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
+ shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
+ shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU);
+ shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
@@ -391,12 +408,18 @@ void __init kasan_init(void)
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)VMALLOC_END + 1),
- shadow_cpu_entry_begin);
+ (void *)shadow_cea_begin);
- kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
- (unsigned long)shadow_cpu_entry_end, 0);
+ /*
+ * Populate the shadow for the shared portion of the CPU entry area.
+ * Shadows for the per-CPU areas are mapped on-demand, as each CPU's
+ * area is randomly placed somewhere in the 512GiB range and mapping
+ * the entire 512GiB range is prohibitively expensive.
+ */
+ kasan_populate_shadow(shadow_cea_begin,
+ shadow_cea_per_cpu_begin, 0);
- kasan_populate_early_shadow(shadow_cpu_entry_end,
+ kasan_populate_early_shadow((void *)shadow_cea_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 557f0fe25dff..37db264866b6 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -172,10 +172,10 @@ void __meminit init_trampoline_kaslr(void)
set_p4d(p4d_tramp,
__p4d(_KERNPG_TABLE | __pa(pud_page_tramp)));
- set_pgd(&trampoline_pgd_entry,
- __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+ trampoline_pgd_entry =
+ __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp));
} else {
- set_pgd(&trampoline_pgd_entry,
- __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+ trampoline_pgd_entry =
+ __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
}
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index d3efbc5b3449..9f82019179e1 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -62,7 +62,13 @@ struct kmmio_context {
int active;
};
-static DEFINE_SPINLOCK(kmmio_lock);
+/*
+ * The kmmio_lock is taken in int3 context, which is treated as NMI context.
+ * This causes lockdep to complain about it bein in both NMI and normal
+ * context. Hide it from lockdep, as it should not have any other locks
+ * taken under it, and this is only enabled for debugging mmio anyway.
+ */
+static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;
/* Protected by kmmio_lock */
unsigned int kmmio_count;
@@ -240,15 +246,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
page_base &= page_level_mask(l);
/*
- * Preemption is now disabled to prevent process switch during
- * single stepping. We can only handle one active kmmio trace
+ * Hold the RCU read lock over single stepping to avoid looking
+ * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
+ * also disables preemption and prevents process switch during
+ * the single stepping. We can only handle one active kmmio trace
* per cpu, so ensure that we finish it before something else
- * gets to run. We also hold the RCU read lock over single
- * stepping to avoid looking up the probe and kmmio_fault_page
- * again.
+ * gets to run.
*/
- preempt_disable();
- rcu_read_lock();
+ rcu_read_lock_sched_notrace();
faultpage = get_kmmio_fault_page(page_base);
if (!faultpage) {
@@ -317,8 +322,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
return 1; /* fault handled */
no_kmmio:
- rcu_read_unlock();
- preempt_enable_no_resched();
+ rcu_read_unlock_sched_notrace();
return ret;
}
@@ -346,10 +350,10 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
ctx->probe->post_handler(ctx->probe, condition, regs);
/* Prevent racing against release_kmmio_fault_page(). */
- spin_lock(&kmmio_lock);
+ arch_spin_lock(&kmmio_lock);
if (ctx->fpage->count)
arm_kmmio_fault_page(ctx->fpage);
- spin_unlock(&kmmio_lock);
+ arch_spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
@@ -357,8 +361,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
/* These were acquired in kmmio_handler(). */
ctx->active--;
BUG_ON(ctx->active);
- rcu_read_unlock();
- preempt_enable_no_resched();
+ rcu_read_unlock_sched_notrace();
/*
* if somebody else is singlestepping across a probe point, flags
@@ -440,7 +443,8 @@ int register_kmmio_probe(struct kmmio_probe *p)
unsigned int l;
pte_t *pte;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
if (get_kmmio_probe(addr)) {
ret = -EEXIST;
goto out;
@@ -460,7 +464,9 @@ int register_kmmio_probe(struct kmmio_probe *p)
size += page_level_size(l);
}
out:
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
+
/*
* XXX: What should I do here?
* Here was a call to global_flush_tlb(), but it does not exist
@@ -494,7 +500,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
while (f) {
if (!f->count) {
list_del_rcu(&f->list);
@@ -506,7 +513,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
}
f = *prevp;
}
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
/* This is the real RCU destroy call. */
call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
@@ -540,14 +548,16 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
if (!pte)
return;
- spin_lock_irqsave(&kmmio_lock, flags);
+ local_irq_save(flags);
+ arch_spin_lock(&kmmio_lock);
while (size < size_lim) {
release_kmmio_fault_page(addr + size, &release_list);
size += page_level_size(l);
}
list_del_rcu(&p->list);
kmmio_count--;
- spin_unlock_irqrestore(&kmmio_lock, flags);
+ arch_spin_unlock(&kmmio_lock);
+ local_irq_restore(flags);
if (!release_list)
return;
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 9c4d8dbcb129..e0b51c09109f 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -513,10 +513,14 @@ void __init mem_encrypt_free_decrypted_mem(void)
npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
/*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
+ * If the unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it. Base the
+ * re-encryption on the same condition used for the decryption in
+ * sme_postprocess_startup(). Higher level abstractions, such as
+ * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM
+ * using vTOM, where sme_me_mask is always zero.
*/
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (sme_me_mask) {
r = set_memory_encrypted(vaddr, npages);
if (r) {
pr_warn("failed to free unused decrypted pages\n");
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 9de3d900bc92..e25288ee33c2 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -26,7 +26,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* RCX - virtual address of the encryption workarea, including:
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
- * - intermediate copy buffer (PMD_PAGE_SIZE)
+ * - intermediate copy buffer (PMD_SIZE)
* R8 - physical address of the pagetables to use for encryption
*/
@@ -123,7 +123,7 @@ SYM_FUNC_START(__enc_copy)
wbinvd /* Invalidate any cache entries */
/* Copy/encrypt up to 2MB at a time */
- movq $PMD_PAGE_SIZE, %r12
+ movq $PMD_SIZE, %r12
1:
cmpq %r12, %r9
jnb 2f
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index f415498d3175..c6efcf559d88 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -93,7 +93,7 @@ struct sme_populate_pgd_data {
* section is 2MB aligned to allow for simple pagetable setup using only
* PMD entries (see vmlinux.lds.S).
*/
-static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch");
+static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
@@ -198,8 +198,8 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
- ppd->vaddr += PMD_PAGE_SIZE;
- ppd->paddr += PMD_PAGE_SIZE;
+ ppd->vaddr += PMD_SIZE;
+ ppd->paddr += PMD_SIZE;
}
}
@@ -225,11 +225,11 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
vaddr_end = ppd->vaddr_end;
/* If start is not 2MB aligned, create PTE entries */
- ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
+ ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE);
__sme_map_range_pte(ppd);
/* Create PMD entries */
- ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
+ ppd->vaddr_end = vaddr_end & PMD_MASK;
__sme_map_range_pmd(ppd);
/* If end is not 2MB aligned, create PTE entries */
@@ -325,7 +325,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
/* Physical addresses gives us the identity mapped virtual addresses */
kernel_start = __pa_symbol(_text);
- kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
+ kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@@ -355,12 +355,12 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* executable encryption area size:
* stack page (PAGE_SIZE)
* encryption routine page (PAGE_SIZE)
- * intermediate copy buffer (PMD_PAGE_SIZE)
+ * intermediate copy buffer (PMD_SIZE)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
execute_start = workarea_start;
- execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
+ execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
/*
@@ -383,7 +383,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* before it is mapped.
*/
workarea_len = execute_len + pgtable_area_len;
- workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
+ workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE);
/*
* Set the address to the start of where newly created pagetable
@@ -600,7 +600,8 @@ void __init sme_enable(struct boot_params *bp)
cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr |
((u64)bp->ext_cmd_line_ptr << 32));
- cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer));
+ if (cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)) < 0)
+ return;
if (!strncmp(buffer, cmdline_on, sizeof(buffer)))
sme_me_mask = me_mask;
diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c
index 423b21e80929..3d2f7f0a6ed1 100644
--- a/arch/x86/mm/pat/cpa-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
@@ -136,10 +136,10 @@ static int pageattr_test(void)
failed += print_split(&sa);
for (i = 0; i < NTEST; i++) {
- unsigned long pfn = prandom_u32_max(max_pfn_mapped);
+ unsigned long pfn = get_random_u32_below(max_pfn_mapped);
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
- len[i] = prandom_u32_max(NPAGES);
+ len[i] = get_random_u32_below(NPAGES);
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 66a209f7eb86..de10800cd4dd 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -43,6 +43,7 @@
#include <linux/rbtree.h>
#include <asm/cacheflush.h>
+#include <asm/cacheinfo.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
@@ -60,41 +61,34 @@
#undef pr_fmt
#define pr_fmt(fmt) "" fmt
-static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __initdata pat_force_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __read_mostly pat_bp_enabled;
-static bool __read_mostly pat_cm_initialized;
+static u64 __ro_after_init pat_msr_val;
/*
* PAT support is enabled by default, but can be disabled for
* various user-requested or hardware-forced reasons:
*/
-void pat_disable(const char *msg_reason)
+static void __init pat_disable(const char *msg_reason)
{
if (pat_disabled)
return;
- if (pat_bp_initialized) {
- WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
- return;
- }
-
pat_disabled = true;
pr_info("x86/PAT: %s\n", msg_reason);
+
+ memory_caching_control &= ~CACHE_PAT;
}
static int __init nopat(char *str)
{
pat_disable("PAT support disabled via boot option.");
- pat_force_disabled = true;
return 0;
}
early_param("nopat", nopat);
bool pat_enabled(void)
{
- return pat_bp_enabled;
+ return !pat_disabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -165,10 +159,10 @@ static inline void set_page_memtype(struct page *pg,
break;
}
+ old_flags = READ_ONCE(pg->flags);
do {
- old_flags = pg->flags;
new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
- } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+ } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
@@ -192,7 +186,8 @@ enum {
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
-static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
+ char *msg)
{
enum page_cache_mode cache;
char *cache_mode;
@@ -219,14 +214,12 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
* configuration.
* Using lower indices is preferred, so we start with highest index.
*/
-static void __init_cache_modes(u64 pat)
+static void __init init_cache_modes(u64 pat)
{
enum page_cache_mode cache;
char pat_msg[33];
int i;
- WARN_ON_ONCE(pat_cm_initialized);
-
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
@@ -234,34 +227,9 @@ static void __init_cache_modes(u64 pat)
update_cache_mode_entry(i, cache);
}
pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
-
- pat_cm_initialized = true;
-}
-
-#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-
-static void pat_bp_init(u64 pat)
-{
- u64 tmp_pat;
-
- if (!boot_cpu_has(X86_FEATURE_PAT)) {
- pat_disable("PAT not supported by the CPU.");
- return;
- }
-
- rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
- if (!tmp_pat) {
- pat_disable("PAT support disabled by the firmware.");
- return;
- }
-
- wrmsrl(MSR_IA32_CR_PAT, pat);
- pat_bp_enabled = true;
-
- __init_cache_modes(pat);
}
-static void pat_ap_init(u64 pat)
+void pat_cpu_init(void)
{
if (!boot_cpu_has(X86_FEATURE_PAT)) {
/*
@@ -271,30 +239,39 @@ static void pat_ap_init(u64 pat)
panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
}
- wrmsrl(MSR_IA32_CR_PAT, pat);
+ wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);
}
-void __init init_cache_modes(void)
+/**
+ * pat_bp_init - Initialize the PAT MSR value and PAT table
+ *
+ * This function initializes PAT MSR value and PAT table with an OS-defined
+ * value to enable additional cache attributes, WC, WT and WP.
+ *
+ * This function prepares the calls of pat_cpu_init() via cache_cpu_init()
+ * on all CPUs.
+ */
+void __init pat_bp_init(void)
{
- u64 pat = 0;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \
+ (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \
+ ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \
+ ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \
+ ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
- if (pat_cm_initialized)
- return;
- if (boot_cpu_has(X86_FEATURE_PAT)) {
- /*
- * CPU supports PAT. Set PAT table to be consistent with
- * PAT MSR. This case supports "nopat" boot option, and
- * virtual machine environments which support PAT without
- * MTRRs. In specific, Xen has unique setup to PAT MSR.
- *
- * If PAT MSR returns 0, it is considered invalid and emulates
- * as No PAT.
- */
- rdmsrl(MSR_IA32_CR_PAT, pat);
- }
+ if (!IS_ENABLED(CONFIG_X86_PAT))
+ pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
+
+ if (!cpu_feature_enabled(X86_FEATURE_PAT))
+ pat_disable("PAT not supported by the CPU.");
+ else
+ rdmsrl(MSR_IA32_CR_PAT, pat_msr_val);
+
+ if (!pat_msr_val) {
+ pat_disable("PAT support disabled by the firmware.");
- if (!pat) {
/*
* No PAT. Emulate the PAT table that corresponds to the two
* cache bits, PWT (Write Through) and PCD (Cache Disable).
@@ -313,40 +290,22 @@ void __init init_cache_modes(void)
* NOTE: When WC or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
- } else if (!pat_force_disabled && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
- /*
- * Clearly PAT is enabled underneath. Allow pat_enabled() to
- * reflect this.
- */
- pat_bp_enabled = true;
+ pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
}
- __init_cache_modes(pat);
-}
-
-/**
- * pat_init - Initialize the PAT MSR and PAT table on the current CPU
- *
- * This function initializes PAT MSR and PAT table with an OS-defined value
- * to enable additional cache attributes, WC, WT and WP.
- *
- * This function must be called on all CPUs using the specific sequence of
- * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
- * procedure for PAT.
- */
-void pat_init(void)
-{
- u64 pat;
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
-#ifndef CONFIG_X86_PAT
- pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
-#endif
-
- if (pat_disabled)
+ /*
+ * Xen PV doesn't allow to set PAT MSR, but all cache modes are
+ * supported.
+ * When running as TDX guest setting the PAT MSR won't work either
+ * due to the requirement to set CR0.CD when doing so. Rely on
+ * firmware to have set the PAT MSR correctly.
+ */
+ if (pat_disabled ||
+ cpu_feature_enabled(X86_FEATURE_XENPV) ||
+ cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+ init_cache_modes(pat_msr_val);
return;
+ }
if ((c->x86_vendor == X86_VENDOR_INTEL) &&
(((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
@@ -371,8 +330,7 @@ void pat_init(void)
* NOTE: When WT or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
} else {
/*
* Full PAT support. We put WT in slot 7 to improve
@@ -400,19 +358,14 @@ void pat_init(void)
* The reserved slots are unused, but mapped to their
* corresponding types in the presence of PAT errata.
*/
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
+ pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
}
- if (!pat_bp_initialized) {
- pat_bp_init(pat);
- pat_bp_initialized = true;
- } else {
- pat_ap_init(pat);
- }
-}
+ memory_caching_control |= CACHE_PAT;
+ init_cache_modes(pat_msr_val);
#undef PAT
+}
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
@@ -1046,7 +999,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
ret = reserve_pfn_range(paddr, size, prot, 0);
if (ret == 0 && vma)
- vma->vm_flags |= VM_PAT;
+ vm_flags_set(vma, VM_PAT);
return ret;
}
@@ -1092,7 +1045,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
* can be for the entire vma (in which case pfn, size are zero).
*/
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size)
+ unsigned long size, bool mm_wr_locked)
{
resource_size_t paddr;
unsigned long prot;
@@ -1111,18 +1064,26 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
- if (vma)
- vma->vm_flags &= ~VM_PAT;
+ if (vma) {
+ if (mm_wr_locked)
+ vm_flags_clear(vma, VM_PAT);
+ else
+ __vm_flags_mod(vma, 0, VM_PAT);
+ }
}
/*
- * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
- * with the old vma after its pfnmap page table has been removed. The new
- * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
+ * untrack_pfn_clear is called if the following situation fits:
+ *
+ * 1) while mremapping a pfnmap for a new region, with the old vma after
+ * its pfnmap page table has been removed. The new vma has a new pfnmap
+ * to the same pfn & cache type with VM_PAT set.
+ * 2) while duplicating vm area, the new vma fails to copy the pgtable from
+ * old vma.
*/
-void untrack_pfn_moved(struct vm_area_struct *vma)
+void untrack_pfn_clear(struct vm_area_struct *vma)
{
- vma->vm_flags &= ~VM_PAT;
+ vm_flags_clear(vma, VM_PAT);
}
pgprot_t pgprot_writecombine(pgprot_t prot)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 2e5a045731de..7159cf787613 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/cc_platform.h>
#include <linux/set_memory.h>
+#include <linux/memregion.h>
#include <asm/e820/api.h>
#include <asm/processor.h>
@@ -219,6 +220,23 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
#ifdef CONFIG_X86_64
+/*
+ * The kernel image is mapped into two places in the virtual address space
+ * (addresses without KASLR, of course):
+ *
+ * 1. The kernel direct map (0xffff880000000000)
+ * 2. The "high kernel map" (0xffffffff81000000)
+ *
+ * We actually execute out of #2. If we get the address of a kernel symbol, it
+ * points to #2, but almost all physical-to-virtual translations point to #1.
+ *
+ * This is so that we can have both a directmap of all physical memory *and*
+ * take full advantage of the the limited (s32) immediate addressing range (2G)
+ * of x86_64.
+ *
+ * See Documentation/arch/x86/x86_64/mm.rst for more detail.
+ */
+
static inline unsigned long highmap_start_pfn(void)
{
return __pa_symbol(_text) >> PAGE_SHIFT;
@@ -330,6 +348,23 @@ void arch_invalidate_pmem(void *addr, size_t size)
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif
+#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+bool cpu_cache_has_invalidate_memregion(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM);
+
+int cpu_cache_invalidate_memregion(int res_desc)
+{
+ if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion()))
+ return -ENXIO;
+ wbinvd_on_all_cpus();
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM);
+#endif
+
static void __cpa_flush_all(void *arg)
{
unsigned long cache = (unsigned long)arg;
@@ -587,10 +622,6 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
{
unsigned long end;
- /* Kernel text is rw at boot up */
- if (system_state == SYSTEM_BOOTING)
- return new;
-
/*
* 32-bit has some unfixable W+X issues, like EFI code
* and writeable data being in the same page. Disable
@@ -747,11 +778,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
switch (level) {
case PG_LEVEL_1G:
phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PUD_PAGE_MASK;
+ offset = virt_addr & ~PUD_MASK;
break;
case PG_LEVEL_2M:
phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
- offset = virt_addr & ~PMD_PAGE_MASK;
+ offset = virt_addr & ~PMD_MASK;
break;
default:
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1041,7 +1072,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
- pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+ pfninc = PMD_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
@@ -1628,8 +1659,11 @@ repeat:
return err;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
+/*
+ * Check the directmap and "high kernel map" 'aliases'.
+ */
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
@@ -1653,6 +1687,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /* Directmap always has NX set, do not modify. */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@@ -1675,6 +1715,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
+ /*
+ * [_text, _brk_end) also covers data, do not modify NX except
+ * in cases where the highmap is the primary target.
+ */
+ if (__supported_pte_mask & _PAGE_NX) {
+ alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
+ alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
+ }
+
cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
@@ -1687,12 +1736,19 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
}
-static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
unsigned long numpages = cpa->numpages;
unsigned long rempages = numpages;
int ret = 0;
+ /*
+ * No changes, easy!
+ */
+ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
+ !cpa->force_split)
+ return ret;
+
while (rempages) {
/*
* Store the remaining nr of pages for the large page
@@ -1705,13 +1761,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
- ret = __change_page_attr(cpa, checkalias);
+ ret = __change_page_attr(cpa, primary);
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
goto out;
- if (checkalias) {
+ if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
ret = cpa_process_alias(cpa);
if (ret)
goto out;
@@ -1739,7 +1795,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
struct page **pages)
{
struct cpa_data cpa;
- int ret, cache, checkalias;
+ int ret, cache;
memset(&cpa, 0, sizeof(cpa));
@@ -1785,20 +1841,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
- cpa.flags = 0;
+ cpa.flags = in_flag;
cpa.curpage = 0;
cpa.force_split = force_split;
- if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
- cpa.flags |= in_flag;
-
- /* No alias checking for _NX bit modifications */
- checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
- /* Has caller explicitly disabled alias checking? */
- if (in_flag & CPA_NO_CHECK_ALIAS)
- checkalias = 0;
-
- ret = __change_page_attr_set_clr(&cpa, checkalias);
+ ret = __change_page_attr_set_clr(&cpa, 1);
/*
* Check whether we really changed something:
@@ -2029,6 +2076,16 @@ int set_memory_ro(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
+int set_memory_rox(unsigned long addr, int numpages)
+{
+ pgprot_t clr = __pgprot(_PAGE_RW);
+
+ if (__supported_pte_mask & _PAGE_NX)
+ clr.pgprot |= _PAGE_NX;
+
+ return change_page_attr_clear(&addr, numpages, clr, 0);
+}
+
int set_memory_rw(unsigned long addr, int numpages)
{
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
@@ -2041,11 +2098,9 @@ int set_memory_np(unsigned long addr, int numpages)
int set_memory_np_noalias(unsigned long addr, int numpages)
{
- int cpa_flags = CPA_NO_CHECK_ALIAS;
-
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(_PAGE_PRESENT), 0,
- cpa_flags, NULL);
+ CPA_NO_CHECK_ALIAS, NULL);
}
int set_memory_4k(unsigned long addr, int numpages)
@@ -2120,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (hv_is_isolation_supported())
- return hv_set_mem_host_visibility(addr, numpages, !enc);
-
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return __set_memory_enc_pgtable(addr, numpages, enc);
@@ -2262,7 +2314,7 @@ static int __set_pages_p(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
- .flags = 0};
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting present flag. otherwise,
@@ -2270,7 +2322,7 @@ static int __set_pages_p(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
static int __set_pages_np(struct page *page, int numpages)
@@ -2281,7 +2333,7 @@ static int __set_pages_np(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0};
+ .flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting not present flag. otherwise,
@@ -2289,7 +2341,7 @@ static int __set_pages_np(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
- return __change_page_attr_set_clr(&cpa, 0);
+ return __change_page_attr_set_clr(&cpa, 1);
}
int set_direct_map_invalid_noflush(struct page *page)
@@ -2360,7 +2412,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
- .flags = 0,
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
@@ -2373,7 +2425,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
out:
@@ -2403,12 +2455,12 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
- .flags = 0,
+ .flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
- retval = __change_page_attr_set_clr(&cpa, 0);
+ retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
return retval;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8525f2876fb4..e4f499eb0f29 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -299,9 +299,6 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
pud_t *pud;
int i;
- if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
- return;
-
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
@@ -434,10 +431,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
+ if (sizeof(pmds) != 0 &&
+ preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
goto out_free_pgd;
- if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
+ if (sizeof(u_pmds) != 0 &&
+ preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
goto out_free_pmds;
if (paravirt_pgd_alloc(mm) != 0)
@@ -451,17 +450,22 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
spin_lock(&pgd_lock);
pgd_ctor(mm, pgd);
- pgd_prepopulate_pmd(mm, pgd, pmds);
- pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
+ if (sizeof(pmds) != 0)
+ pgd_prepopulate_pmd(mm, pgd, pmds);
+
+ if (sizeof(u_pmds) != 0)
+ pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
spin_unlock(&pgd_lock);
return pgd;
out_free_user_pmds:
- free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
+ if (sizeof(u_pmds) != 0)
+ free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
- free_pmds(mm, pmds, PREALLOCATED_PMDS);
+ if (sizeof(pmds) != 0)
+ free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
_pgd_free(pgd);
out:
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index ffe3b3a087fe..78414c6d1b5e 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -592,7 +592,7 @@ static void pti_set_kernel_image_nonglobal(void)
* of the image.
*/
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+ unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE);
/*
* This clears _PAGE_GLOBAL from the entire kernel image.
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d7..267acf27480a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -154,26 +154,30 @@ static inline u16 user_pcid(u16 asid)
return ret;
}
-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{
+ unsigned long cr3 = __sme_pa(pgd) | lam;
+
if (static_cpu_has(X86_FEATURE_PCID)) {
- return __sme_pa(pgd) | kern_pcid(asid);
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(pgd);
}
+
+ return cr3;
}
-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+ unsigned long lam)
{
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
- return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
}
/*
@@ -274,15 +278,16 @@ static inline void invalidate_user_asid(u16 asid)
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+ bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
- new_mm_cr3 = build_cr3(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
} else {
- new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
}
/*
@@ -491,6 +496,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
@@ -520,7 +526,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -552,10 +559,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* instruction.
*/
if (real_prev == next) {
+ /* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
+ * If this races with another thread that enables lam, 'new_lam'
+ * might not match tlbstate_lam_cr3_mask().
+ */
+
+ /*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
@@ -622,15 +635,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}
+ set_tlbstate_lam_mode(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -691,6 +705,10 @@ void initialize_tlbstate_and_flush(void)
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+ /* LAM expected to be disabled */
+ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+ WARN_ON(mm_lam_cr3_mask(mm));
+
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
* doesn't work like other CR4 bits because it can only be set from
@@ -699,8 +717,8 @@ void initialize_tlbstate_and_flush(void)
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
- /* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm->pgd, 0));
+ /* Disable LAM, force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm->pgd, 0, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -708,6 +726,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+ set_tlbstate_lam_mode(mm);
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -925,7 +944,7 @@ void flush_tlb_multi(const struct cpumask *cpumask,
}
/*
- * See Documentation/x86/tlb.rst for details. We choose 33
+ * See Documentation/arch/x86/tlb.rst for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
@@ -1071,8 +1090,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
*/
unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
- this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ unsigned long cr3 =
+ build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+ tlbstate_lam_cr3_mask());
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
@@ -1205,7 +1226,7 @@ void __flush_tlb_all(void)
*/
VM_WARN_ON_ONCE(preemptible());
- if (boot_cpu_has(X86_FEATURE_PGE)) {
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 00127abd89ee..438adb695daa 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,8 +11,8 @@
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
-#include <linux/init.h>
#include <asm/extable.h>
+#include <asm/ftrace.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
@@ -341,6 +341,13 @@ static int emit_call(u8 **pprog, void *func, void *ip)
return emit_patch(pprog, func, ip, 0xE8);
}
+static int emit_rsb_call(u8 **pprog, void *func, void *ip)
+{
+ OPTIMIZER_HIDE_VAR(func);
+ x86_call_depth_emit_accounting(pprog, func);
+ return emit_patch(pprog, func, ip, 0xE8);
+}
+
static int emit_jump(u8 **pprog, void *func, void *ip)
{
return emit_patch(pprog, func, ip, 0xE9);
@@ -389,18 +396,6 @@ out:
return ret;
}
-int __init bpf_arch_init_dispatcher_early(void *ip)
-{
- const u8 *nop_insn = x86_nops[5];
-
- if (is_endbr(*(u32 *)ip))
- ip += ENDBR_INSN_SIZE;
-
- if (memcmp(ip, nop_insn, X86_PATCH_SIZE))
- text_poke_early(ip, nop_insn, X86_PATCH_SIZE);
- return 0;
-}
-
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr)
{
@@ -430,7 +425,10 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
EMIT2(0xFF, 0xE0 + reg);
} else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) {
OPTIMIZER_HIDE_VAR(reg);
- emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
+ if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
+ emit_jump(&prog, &__x86_indirect_jump_thunk_array[reg], ip);
+ else
+ emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else {
EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
@@ -445,7 +443,7 @@ static void emit_return(u8 **pprog, u8 *ip)
u8 *prog = *pprog;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
- emit_jump(&prog, &__x86_return_thunk, ip);
+ emit_jump(&prog, x86_return_thunk, ip);
} else {
EMIT1(0xC3); /* ret */
if (IS_ENABLED(CONFIG_SLS))
@@ -904,6 +902,65 @@ static void emit_nops(u8 **pprog, int len)
*pprog = prog;
}
+/* emit the 3-byte VEX prefix
+ *
+ * r: same as rex.r, extra bit for ModRM reg field
+ * x: same as rex.x, extra bit for SIB index field
+ * b: same as rex.b, extra bit for ModRM r/m, or SIB base
+ * m: opcode map select, encoding escape bytes e.g. 0x0f38
+ * w: same as rex.w (32 bit or 64 bit) or opcode specific
+ * src_reg2: additional source reg (encoded as BPF reg)
+ * l: vector length (128 bit or 256 bit) or reserved
+ * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
+ */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+ bool w, u8 src_reg2, bool l, u8 pp)
+{
+ u8 *prog = *pprog;
+ const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */
+ u8 b1, b2;
+ u8 vvvv = reg2hex[src_reg2];
+
+ /* reg2hex gives only the lower 3 bit of vvvv */
+ if (is_ereg(src_reg2))
+ vvvv |= 1 << 3;
+
+ /*
+ * 2nd byte of 3-byte VEX prefix
+ * ~ means bit inverted encoding
+ *
+ * 7 0
+ * +---+---+---+---+---+---+---+---+
+ * |~R |~X |~B | m |
+ * +---+---+---+---+---+---+---+---+
+ */
+ b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+ /*
+ * 3rd byte of 3-byte VEX prefix
+ *
+ * 7 0
+ * +---+---+---+---+---+---+---+---+
+ * | W | ~vvvv | L | pp |
+ * +---+---+---+---+---+---+---+---+
+ */
+ b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3);
+
+ EMIT3(b0, b1, b2);
+ *pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+ u8 *prog = *pprog;
+ bool r = is_ereg(dst_reg);
+ u8 m = 2; /* escape code 0f38 */
+
+ emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+ EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+ *pprog = prog;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -946,6 +1003,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
u8 b2 = 0, b3 = 0;
u8 *start_of_ldx;
s64 jmp_offset;
+ s16 insn_off;
u8 jmp_cond;
u8 *func;
int nops;
@@ -1150,17 +1208,38 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
case BPF_ALU64 | BPF_LSH | BPF_X:
case BPF_ALU64 | BPF_RSH | BPF_X:
case BPF_ALU64 | BPF_ARSH | BPF_X:
+ /* BMI2 shifts aren't better when shift count is already in rcx */
+ if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+ /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+ bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+ u8 op;
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH:
+ op = 1; /* prefix 0x66 */
+ break;
+ case BPF_RSH:
+ op = 3; /* prefix 0xf2 */
+ break;
+ case BPF_ARSH:
+ op = 2; /* prefix 0xf3 */
+ break;
+ }
+
+ emit_shiftx(&prog, dst_reg, src_reg, w, op);
- /* Check for bad case when dst_reg == rcx */
- if (dst_reg == BPF_REG_4) {
- /* mov r11, dst_reg */
- EMIT_mov(AUX_REG, dst_reg);
- dst_reg = AUX_REG;
+ break;
}
if (src_reg != BPF_REG_4) { /* common case */
- EMIT1(0x51); /* push rcx */
-
+ /* Check for bad case when dst_reg == rcx */
+ if (dst_reg == BPF_REG_4) {
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+ dst_reg = AUX_REG;
+ } else {
+ EMIT1(0x51); /* push rcx */
+ }
/* mov rcx, src_reg */
EMIT_mov(BPF_REG_4, src_reg);
}
@@ -1172,12 +1251,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
b3 = simple_alu_opcodes[BPF_OP(insn->code)];
EMIT2(0xD3, add_1reg(b3, dst_reg));
- if (src_reg != BPF_REG_4)
- EMIT1(0x59); /* pop rcx */
+ if (src_reg != BPF_REG_4) {
+ if (insn->dst_reg == BPF_REG_4)
+ /* mov dst_reg, r11 */
+ EMIT_mov(insn->dst_reg, AUX_REG);
+ else
+ EMIT1(0x59); /* pop rcx */
+ }
- if (insn->dst_reg == BPF_REG_4)
- /* mov dst_reg, r11 */
- EMIT_mov(insn->dst_reg, AUX_REG);
break;
case BPF_ALU | BPF_END | BPF_FROM_BE:
@@ -1239,8 +1320,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
/* speculation barrier */
case BPF_ST | BPF_NOSPEC:
- if (boot_cpu_has(X86_FEATURE_XMM2))
- EMIT_LFENCE();
+ EMIT_LFENCE();
break;
/* ST: *(u8*)(dst_reg + off) = imm */
@@ -1290,57 +1370,52 @@ st: if (is_imm8(insn->off))
case BPF_LDX | BPF_PROBE_MEM | BPF_W:
case BPF_LDX | BPF_MEM | BPF_DW:
case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
+ insn_off = insn->off;
+
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
- /* Though the verifier prevents negative insn->off in BPF_PROBE_MEM
- * add abs(insn->off) to the limit to make sure that negative
- * offset won't be an issue.
- * insn->off is s16, so it won't affect valid pointers.
+ /* Conservatively check that src_reg + insn->off is a kernel address:
+ * src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
+ * src_reg is used as scratch for src_reg += insn->off and restored
+ * after emit_ldx if necessary
*/
- u64 limit = TASK_SIZE_MAX + PAGE_SIZE + abs(insn->off);
- u8 *end_of_jmp1, *end_of_jmp2;
- /* Conservatively check that src_reg + insn->off is a kernel address:
- * 1. src_reg + insn->off >= limit
- * 2. src_reg + insn->off doesn't become small positive.
- * Cannot do src_reg + insn->off >= limit in one branch,
- * since it needs two spare registers, but JIT has only one.
+ u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
+ u8 *end_of_jmp;
+
+ /* At end of these emitted checks, insn->off will have been added
+ * to src_reg, so no need to do relative load with insn->off offset
*/
+ insn_off = 0;
/* movabsq r11, limit */
EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
EMIT((u32)limit, 4);
EMIT(limit >> 32, 4);
+
+ if (insn->off) {
+ /* add src_reg, insn->off */
+ maybe_emit_1mod(&prog, src_reg, true);
+ EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
+ }
+
/* cmp src_reg, r11 */
maybe_emit_mod(&prog, src_reg, AUX_REG, true);
EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
- /* if unsigned '<' goto end_of_jmp2 */
- EMIT2(X86_JB, 0);
- end_of_jmp1 = prog;
-
- /* mov r11, src_reg */
- emit_mov_reg(&prog, true, AUX_REG, src_reg);
- /* add r11, insn->off */
- maybe_emit_1mod(&prog, AUX_REG, true);
- EMIT2_off32(0x81, add_1reg(0xC0, AUX_REG), insn->off);
- /* jmp if not carry to start_of_ldx
- * Otherwise ERR_PTR(-EINVAL) + 128 will be the user addr
- * that has to be rejected.
- */
- EMIT2(0x73 /* JNC */, 0);
- end_of_jmp2 = prog;
+
+ /* if unsigned '>=', goto load */
+ EMIT2(X86_JAE, 0);
+ end_of_jmp = prog;
/* xor dst_reg, dst_reg */
emit_mov_imm32(&prog, false, dst_reg, 0);
/* jmp byte_after_ldx */
EMIT2(0xEB, 0);
- /* populate jmp_offset for JB above to jump to xor dst_reg */
- end_of_jmp1[-1] = end_of_jmp2 - end_of_jmp1;
- /* populate jmp_offset for JNC above to jump to start_of_ldx */
+ /* populate jmp_offset for JAE above to jump to start_of_ldx */
start_of_ldx = prog;
- end_of_jmp2[-1] = start_of_ldx - end_of_jmp2;
+ end_of_jmp[-1] = start_of_ldx - end_of_jmp;
}
- emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+ emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
struct exception_table_entry *ex;
u8 *_insn = image + proglen + (start_of_ldx - temp);
@@ -1349,6 +1424,18 @@ st: if (is_imm8(insn->off))
/* populate jmp_offset for JMP above */
start_of_ldx[-1] = prog - start_of_ldx;
+ if (insn->off && src_reg != dst_reg) {
+ /* sub src_reg, insn->off
+ * Restore src_reg after "add src_reg, insn->off" in prev
+ * if statement. But if src_reg == dst_reg, emit_ldx
+ * above already clobbered src_reg, so no need to restore.
+ * If add src_reg, insn->off was unnecessary, no need to
+ * restore either.
+ */
+ maybe_emit_1mod(&prog, src_reg, true);
+ EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
+ }
+
if (!bpf_prog->aux->extable)
break;
@@ -1446,19 +1533,26 @@ st: if (is_imm8(insn->off))
break;
/* call */
- case BPF_JMP | BPF_CALL:
+ case BPF_JMP | BPF_CALL: {
+ int offs;
+
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
EMIT3_off32(0x48, 0x8B, 0x85,
-round_up(bpf_prog->aux->stack_depth, 8) - 8);
- if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
+ if (!imm32)
return -EINVAL;
+ offs = 7 + x86_call_depth_emit_accounting(&prog, func);
} else {
- if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
+ if (!imm32)
return -EINVAL;
+ offs = x86_call_depth_emit_accounting(&prog, func);
}
+ if (emit_call(&prog, func, image + addrs[i - 1] + offs))
+ return -EINVAL;
break;
+ }
case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
@@ -1763,62 +1857,59 @@ emit_jmp:
return proglen;
}
-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
+static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
int stack_size)
{
- int i, j, arg_size, nr_regs;
+ int i, j, arg_size;
+ bool next_same_struct = false;
+
/* Store function arguments to stack.
* For a function that accepts two pointers the sequence will be:
* mov QWORD PTR [rbp-0x10],rdi
* mov QWORD PTR [rbp-0x8],rsi
*/
- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
- nr_regs = (m->arg_size[i] + 7) / 8;
+ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+ /* The arg_size is at most 16 bytes, enforced by the verifier. */
+ arg_size = m->arg_size[j];
+ if (arg_size > 8) {
arg_size = 8;
- } else {
- nr_regs = 1;
- arg_size = m->arg_size[i];
+ next_same_struct = !next_same_struct;
}
- while (nr_regs) {
- emit_stx(prog, bytes_to_bpf_size(arg_size),
- BPF_REG_FP,
- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
- -(stack_size - j * 8));
- nr_regs--;
- j++;
- }
+ emit_stx(prog, bytes_to_bpf_size(arg_size),
+ BPF_REG_FP,
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ -(stack_size - i * 8));
+
+ j = next_same_struct ? j : j + 1;
}
}
-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
+static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
int stack_size)
{
- int i, j, arg_size, nr_regs;
+ int i, j, arg_size;
+ bool next_same_struct = false;
/* Restore function arguments from stack.
* For a function that accepts two pointers the sequence will be:
* EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
* EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
*/
- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
- nr_regs = (m->arg_size[i] + 7) / 8;
+ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+ /* The arg_size is at most 16 bytes, enforced by the verifier. */
+ arg_size = m->arg_size[j];
+ if (arg_size > 8) {
arg_size = 8;
- } else {
- nr_regs = 1;
- arg_size = m->arg_size[i];
+ next_same_struct = !next_same_struct;
}
- while (nr_regs) {
- emit_ldx(prog, bytes_to_bpf_size(arg_size),
- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
- BPF_REG_FP,
- -(stack_size - j * 8));
- nr_regs--;
- j++;
- }
+ emit_ldx(prog, bytes_to_bpf_size(arg_size),
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ BPF_REG_FP,
+ -(stack_size - i * 8));
+
+ j = next_same_struct ? j : j + 1;
}
}
@@ -1826,10 +1917,6 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
struct bpf_tramp_link *l, int stack_size,
int run_ctx_off, bool save_ret)
{
- void (*exit)(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_exit;
- u64 (*enter)(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx) = __bpf_prog_enter;
u8 *prog = *pprog;
u8 *jmp_insn;
int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
@@ -1848,23 +1935,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
*/
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_1, -run_ctx_off + ctx_cookie_off);
- if (p->aux->sleepable) {
- enter = __bpf_prog_enter_sleepable;
- exit = __bpf_prog_exit_sleepable;
- } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
- enter = __bpf_prog_enter_struct_ops;
- exit = __bpf_prog_exit_struct_ops;
- } else if (p->expected_attach_type == BPF_LSM_CGROUP) {
- enter = __bpf_prog_enter_lsm_cgroup;
- exit = __bpf_prog_exit_lsm_cgroup;
- }
-
/* arg1: mov rdi, progs[i] */
emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p);
/* arg2: lea rsi, [rbp - ctx_cookie_off] */
EMIT4(0x48, 0x8D, 0x75, -run_ctx_off);
- if (emit_call(&prog, enter, prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_enter(p), prog))
return -EINVAL;
/* remember prog start time returned by __bpf_prog_enter */
emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);
@@ -1885,7 +1961,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
/* call JITed bpf program or interpreter */
- if (emit_call(&prog, p->bpf_func, prog))
+ if (emit_rsb_call(&prog, p->bpf_func, prog))
return -EINVAL;
/*
@@ -1909,7 +1985,7 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
/* arg3: lea rdx, [rbp - run_ctx_off] */
EMIT4(0x48, 0x8D, 0x55, -run_ctx_off);
- if (emit_call(&prog, exit, prog))
+ if (emit_rsb_call(&prog, bpf_trampoline_exit(p), prog))
return -EINVAL;
*pprog = prog;
@@ -2059,8 +2135,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
struct bpf_tramp_links *tlinks,
void *func_addr)
{
- int ret, i, nr_args = m->nr_args, extra_nregs = 0;
- int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
+ int i, ret, nr_regs = m->nr_args, stack_size = 0;
+ int regs_off, nregs_off, ip_off, run_ctx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
@@ -2069,17 +2145,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
u8 *prog;
bool save_ret;
- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
- if (nr_args > 6)
- return -ENOTSUPP;
-
- for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+ /* extra registers for struct arguments */
+ for (i = 0; i < m->nr_args; i++)
if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
- extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
- }
- if (nr_args + extra_nregs > 6)
+ nr_regs += (m->arg_size[i] + 7) / 8 - 1;
+
+ /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
+ if (nr_regs > 6)
return -ENOTSUPP;
- stack_size += extra_nregs * 8;
/* Generated trampoline stack layout:
*
@@ -2093,7 +2166,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
* [ ... ]
* RBP - regs_off [ reg_arg1 ] program's ctx pointer
*
- * RBP - args_off [ arg regs count ] always
+ * RBP - nregs_off [ regs count ] always
*
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
*
@@ -2105,11 +2178,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (save_ret)
stack_size += 8;
+ stack_size += nr_regs * 8;
regs_off = stack_size;
- /* args count */
+ /* regs count */
stack_size += 8;
- args_off = stack_size;
+ nregs_off = stack_size;
if (flags & BPF_TRAMP_F_IP_ARG)
stack_size += 8; /* room for IP address argument */
@@ -2131,17 +2205,22 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
prog = image;
EMIT_ENDBR();
+ /*
+ * This is the direct-call trampoline, as such it needs accounting
+ * for the __fentry__ call.
+ */
+ x86_call_depth_emit_accounting(&prog, NULL);
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
/* Store number of argument registers of the traced function:
- * mov rax, nr_args + extra_nregs
- * mov QWORD PTR [rbp - args_off], rax
+ * mov rax, nr_regs
+ * mov QWORD PTR [rbp - nregs_off], rax
*/
- emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
@@ -2152,12 +2231,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
- save_regs(m, &prog, nr_args, regs_off);
+ save_regs(m, &prog, nr_regs, regs_off);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_call(&prog, __bpf_tramp_enter, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2182,14 +2261,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_CALL_ORIG) {
- restore_regs(m, &prog, nr_args, regs_off);
+ restore_regs(m, &prog, nr_regs, regs_off);
if (flags & BPF_TRAMP_F_ORIG_STACK) {
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
EMIT2(0xff, 0xd0); /* call *rax */
} else {
/* call original function */
- if (emit_call(&prog, orig_call, prog)) {
+ if (emit_rsb_call(&prog, orig_call, prog)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2223,7 +2302,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
}
if (flags & BPF_TRAMP_F_RESTORE_REGS)
- restore_regs(m, &prog, nr_args, regs_off);
+ restore_regs(m, &prog, nr_regs, regs_off);
/* This needs to be done regardless. If there were fmod_ret programs,
* the return value is only updated on the stack and still needs to be
@@ -2233,7 +2312,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
im->ip_epilogue = prog;
/* arg1: mov rdi, im */
emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im);
- if (emit_call(&prog, __bpf_tramp_exit, prog)) {
+ if (emit_rsb_call(&prog, __bpf_tramp_exit, prog)) {
ret = -EINVAL;
goto cleanup;
}
@@ -2491,7 +2570,7 @@ out_image:
}
if (bpf_jit_enable > 1)
- bpf_jit_dump(prog->len, proglen, pass + 1, image);
+ bpf_jit_dump(prog->len, proglen, pass + 1, rw_image);
if (image) {
if (!prog->is_func || extra_pass) {
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 2f82480fd430..ea2eb2ec90e2 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -1,4 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "PCI: " fmt
+
#include <linux/pci.h>
#include <linux/acpi.h>
#include <linux/init.h>
@@ -37,15 +40,15 @@ static int __init set_nouse_crs(const struct dmi_system_id *id)
static int __init set_ignore_seg(const struct dmi_system_id *id)
{
- printk(KERN_INFO "PCI: %s detected: ignoring ACPI _SEG\n", id->ident);
+ pr_info("%s detected: ignoring ACPI _SEG\n", id->ident);
pci_ignore_seg = true;
return 0;
}
static int __init set_no_e820(const struct dmi_system_id *id)
{
- printk(KERN_INFO "PCI: %s detected: not clipping E820 regions from _CRS\n",
- id->ident);
+ pr_info("%s detected: not clipping E820 regions from _CRS\n",
+ id->ident);
pci_use_e820 = false;
return 0;
}
@@ -231,10 +234,9 @@ void __init pci_acpi_crs_quirks(void)
else if (pci_probe & PCI_USE__CRS)
pci_use_crs = true;
- printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
- "if necessary, use \"pci=%s\" and report a bug\n",
- pci_use_crs ? "Using" : "Ignoring",
- pci_use_crs ? "nocrs" : "use_crs");
+ pr_info("%s host bridge windows from ACPI; if necessary, use \"pci=%s\" and report a bug\n",
+ pci_use_crs ? "Using" : "Ignoring",
+ pci_use_crs ? "nocrs" : "use_crs");
/* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */
if (pci_probe & PCI_NO_E820)
@@ -242,19 +244,17 @@ void __init pci_acpi_crs_quirks(void)
else if (pci_probe & PCI_USE_E820)
pci_use_e820 = true;
- printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n",
- pci_use_e820 ? "Using" : "Ignoring");
+ pr_info("%s E820 reservations for host bridge windows\n",
+ pci_use_e820 ? "Using" : "Ignoring");
if (pci_probe & (PCI_NO_E820 | PCI_USE_E820))
- printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n");
+ pr_info("Please notify linux-pci@vger.kernel.org so future kernels can do this automatically\n");
}
#ifdef CONFIG_PCI_MMCONFIG
static int check_segment(u16 seg, struct device *dev, char *estr)
{
if (seg) {
- dev_err(dev,
- "%s can't access PCI configuration "
- "space under this host bridge.\n",
+ dev_err(dev, "%s can't access configuration space under this host bridge\n",
estr);
return -EIO;
}
@@ -264,9 +264,7 @@ static int check_segment(u16 seg, struct device *dev, char *estr)
* just can't access extended configuration space of
* devices under this host bridge.
*/
- dev_warn(dev,
- "%s can't access extended PCI configuration "
- "space under this bridge.\n",
+ dev_warn(dev, "%s can't access extended configuration space under this bridge\n",
estr);
return 0;
@@ -421,9 +419,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
root->segment = domain = 0;
if (domain && !pci_domains_supported) {
- printk(KERN_WARNING "pci_bus %04x:%02x: "
- "ignored (multiple domains not supported)\n",
- domain, busnum);
+ pr_warn("pci_bus %04x:%02x: ignored (multiple domains not supported)\n",
+ domain, busnum);
return NULL;
}
@@ -491,7 +488,7 @@ int __init pci_acpi_init(void)
if (acpi_noirq)
return -ENODEV;
- printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
+ pr_info("Using ACPI for IRQ routing\n");
acpi_irq_penalty_init();
pcibios_enable_irq = acpi_pci_irq_enable;
pcibios_disable_irq = acpi_pci_irq_disable;
@@ -503,7 +500,7 @@ int __init pci_acpi_init(void)
* also do it here in case there are still broken drivers that
* don't use pci_enable_device().
*/
- printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ pr_info("Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
for_each_pci_dev(dev)
acpi_pci_irq_enable(dev);
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 615a76d70019..e3ec02e6ac9f 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -7,6 +7,7 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/vgaarb.h>
+#include <asm/amd_nb.h>
#include <asm/hpet.h>
#include <asm/pci_x86.h>
@@ -824,3 +825,82 @@ static void rs690_fix_64bit_dma(struct pci_dev *pdev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x7910, rs690_fix_64bit_dma);
#endif
+
+#ifdef CONFIG_AMD_NB
+
+#define AMD_15B8_RCC_DEV2_EPF0_STRAP2 0x10136008
+#define AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK 0x00000080L
+
+static void quirk_clear_strap_no_soft_reset_dev2_f0(struct pci_dev *dev)
+{
+ u32 data;
+
+ if (!amd_smn_read(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, &data)) {
+ data &= ~AMD_15B8_RCC_DEV2_EPF0_STRAP2_NO_SOFT_RESET_DEV2_F0_MASK;
+ if (amd_smn_write(0, AMD_15B8_RCC_DEV2_EPF0_STRAP2, data))
+ pci_err(dev, "Failed to write data 0x%x\n", data);
+ } else {
+ pci_err(dev, "Failed to read data\n");
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, 0x15b8, quirk_clear_strap_no_soft_reset_dev2_f0);
+#endif
+
+/*
+ * When returning from D3cold to D0, firmware on some Google Coral and Reef
+ * family Chromebooks with Intel Apollo Lake SoC clobbers the headers of
+ * both the L1 PM Substates capability and the previous capability for the
+ * "Celeron N3350/Pentium N4200/Atom E3900 Series PCI Express Port B #1".
+ *
+ * Save those values at enumeration-time and restore them at resume.
+ */
+
+static u16 prev_cap, l1ss_cap;
+static u32 prev_header, l1ss_header;
+
+static void chromeos_save_apl_pci_l1ss_capability(struct pci_dev *dev)
+{
+ int pos = PCI_CFG_SPACE_SIZE, prev = 0;
+ u32 header, pheader = 0;
+
+ while (pos) {
+ pci_read_config_dword(dev, pos, &header);
+ if (PCI_EXT_CAP_ID(header) == PCI_EXT_CAP_ID_L1SS) {
+ prev_cap = prev;
+ prev_header = pheader;
+ l1ss_cap = pos;
+ l1ss_header = header;
+ return;
+ }
+
+ prev = pos;
+ pheader = header;
+ pos = PCI_EXT_CAP_NEXT(header);
+ }
+}
+
+static void chromeos_fixup_apl_pci_l1ss_capability(struct pci_dev *dev)
+{
+ u32 header;
+
+ if (!prev_cap || !prev_header || !l1ss_cap || !l1ss_header)
+ return;
+
+ /* Fixup the header of L1SS Capability if missing */
+ pci_read_config_dword(dev, l1ss_cap, &header);
+ if (header != l1ss_header) {
+ pci_write_config_dword(dev, l1ss_cap, l1ss_header);
+ pci_info(dev, "restore L1SS Capability header (was %#010x now %#010x)\n",
+ header, l1ss_header);
+ }
+
+ /* Fixup the link to L1SS Capability if missing */
+ pci_read_config_dword(dev, prev_cap, &header);
+ if (header != prev_header) {
+ pci_write_config_dword(dev, prev_cap, prev_header);
+ pci_info(dev, "restore previous Capability header (was %#010x now %#010x)\n",
+ header, prev_header);
+ }
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_save_apl_pci_l1ss_capability);
+DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_INTEL, 0x5ad6, chromeos_fixup_apl_pci_l1ss_capability);
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 758cbfe55daa..4b3efaa82ab7 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -12,6 +12,7 @@
*/
#include <linux/acpi.h>
+#include <linux/efi.h>
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/bitmap.h>
@@ -442,17 +443,42 @@ static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used)
return mcfg_res.flags;
}
+static bool is_efi_mmio(u64 start, u64 end, enum e820_type not_used)
+{
+#ifdef CONFIG_EFI
+ efi_memory_desc_t *md;
+ u64 size, mmio_start, mmio_end;
+
+ for_each_efi_memory_desc(md) {
+ if (md->type == EFI_MEMORY_MAPPED_IO) {
+ size = md->num_pages << EFI_PAGE_SHIFT;
+ mmio_start = md->phys_addr;
+ mmio_end = mmio_start + size;
+
+ /*
+ * N.B. Caller supplies (start, start + size),
+ * so to match, mmio_end is the first address
+ * *past* the EFI_MEMORY_MAPPED_IO area.
+ */
+ if (mmio_start <= start && end <= mmio_end)
+ return true;
+ }
+ }
+#endif
+
+ return false;
+}
+
typedef bool (*check_reserved_t)(u64 start, u64 end, enum e820_type type);
static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
struct pci_mmcfg_region *cfg,
- struct device *dev, int with_e820)
+ struct device *dev, const char *method)
{
u64 addr = cfg->res.start;
u64 size = resource_size(&cfg->res);
u64 old_size = size;
int num_buses;
- char *method = with_e820 ? "E820" : "ACPI motherboard resources";
while (!is_reserved(addr, addr + size, E820_TYPE_RESERVED)) {
size >>= 1;
@@ -464,10 +490,10 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved,
return false;
if (dev)
- dev_info(dev, "MMCONFIG at %pR reserved in %s\n",
+ dev_info(dev, "MMCONFIG at %pR reserved as %s\n",
&cfg->res, method);
else
- pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n",
+ pr_info(PREFIX "MMCONFIG at %pR reserved as %s\n",
&cfg->res, method);
if (old_size != size) {
@@ -500,7 +526,8 @@ static bool __ref
pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early)
{
if (!early && !acpi_disabled) {
- if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0))
+ if (is_mmconf_reserved(is_acpi_reserved, cfg, dev,
+ "ACPI motherboard resource"))
return true;
if (dev)
@@ -513,6 +540,10 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
"MMCONFIG at %pR not reserved in "
"ACPI motherboard resources\n",
&cfg->res);
+
+ if (is_mmconf_reserved(is_efi_mmio, cfg, dev,
+ "EfiMemoryMappedIO"))
+ return true;
}
/*
@@ -527,7 +558,8 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e
/* Don't try to do this check unless configuration
type 1 is available. how about type 2 ?*/
if (raw_pci_ops)
- return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1);
+ return is_mmconf_reserved(e820__mapped_all, cfg, dev,
+ "E820 entry");
return false;
}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index b94f727251b6..014c508e914d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -198,7 +198,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
i++;
}
kfree(v);
- return 0;
+ return msi_device_populate_sysfs(&dev->dev);
error:
if (ret == -ENOSYS)
@@ -254,7 +254,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
dev_dbg(&dev->dev,
"xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
}
- return 0;
+ return msi_device_populate_sysfs(&dev->dev);
error:
dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n",
@@ -346,7 +346,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret < 0)
goto out;
}
- ret = 0;
+ ret = msi_device_populate_sysfs(&dev->dev);
out:
return ret;
}
@@ -392,7 +392,10 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
msi_for_each_desc(msidesc, &dev->dev, MSI_DESC_ASSOCIATED) {
for (i = 0; i < msidesc->nvec_used; i++)
xen_destroy_irq(msidesc->irq + i);
+ msidesc->irq = 0;
}
+
+ msi_device_destroy_sysfs(&dev->dev);
}
static void xen_pv_teardown_msi_irqs(struct pci_dev *dev)
@@ -433,6 +436,7 @@ static struct msi_domain_ops xen_pci_msi_domain_ops = {
};
static struct msi_domain_info xen_pci_msi_domain_info = {
+ .flags = MSI_FLAG_PCI_MSIX | MSI_FLAG_FREE_MSI_DESCS | MSI_FLAG_DEV_SYSFS,
.ops = &xen_pci_msi_domain_ops,
};
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index a50245157685..543df9a1379d 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -2,5 +2,8 @@
KASAN_SANITIZE := n
GCOV_PROFILE := n
-obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EFI) += memmap.o quirks.o efi.o efi_$(BITS).o \
+ efi_stub_$(BITS).o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
+obj-$(CONFIG_EFI_FAKE_MEMMAP) += fake_mem.o
+obj-$(CONFIG_EFI_RUNTIME_MAP) += runtime-map.o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index ebc98a68c400..f3f2d87cce1b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -214,9 +214,11 @@ int __init efi_memblock_x86_reserve_range(void)
data.desc_size = e->efi_memdesc_size;
data.desc_version = e->efi_memdesc_version;
- rv = efi_memmap_init_early(&data);
- if (rv)
- return rv;
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+ }
if (add_efi_memmap || do_efi_soft_reserve())
do_add_efi_memmap();
@@ -303,6 +305,50 @@ static void __init efi_clean_memmap(void)
}
}
+/*
+ * Firmware can use EfiMemoryMappedIO to request that MMIO regions be
+ * mapped by the OS so they can be accessed by EFI runtime services, but
+ * should have no other significance to the OS (UEFI r2.10, sec 7.2).
+ * However, most bootloaders and EFI stubs convert EfiMemoryMappedIO
+ * regions to E820_TYPE_RESERVED entries, which prevent Linux from
+ * allocating space from them (see remove_e820_regions()).
+ *
+ * Some platforms use EfiMemoryMappedIO entries for PCI MMCONFIG space and
+ * PCI host bridge windows, which means Linux can't allocate BAR space for
+ * hot-added devices.
+ *
+ * Remove large EfiMemoryMappedIO regions from the E820 map to avoid this
+ * problem.
+ *
+ * Retain small EfiMemoryMappedIO regions because on some platforms, these
+ * describe non-window space that's included in host bridge _CRS. If we
+ * assign that space to PCI devices, they don't work.
+ */
+static void __init efi_remove_e820_mmio(void)
+{
+ efi_memory_desc_t *md;
+ u64 size, start, end;
+ int i = 0;
+
+ for_each_efi_memory_desc(md) {
+ if (md->type == EFI_MEMORY_MAPPED_IO) {
+ size = md->num_pages << EFI_PAGE_SHIFT;
+ start = md->phys_addr;
+ end = start + size - 1;
+ if (size >= 256*1024) {
+ pr_info("Remove mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluMB) from e820 map\n",
+ i, start, end, size >> 20);
+ e820__range_remove(start, size,
+ E820_TYPE_RESERVED, 1);
+ } else {
+ pr_info("Not removing mem%02u: MMIO range=[0x%08llx-0x%08llx] (%lluKB) from e820 map\n",
+ i, start, end, size >> 10);
+ }
+ }
+ i++;
+ }
+}
+
void __init efi_print_memmap(void)
{
efi_memory_desc_t *md;
@@ -334,7 +380,7 @@ static int __init efi_systab_init(unsigned long phys)
return -ENOMEM;
}
- ret = efi_systab_check_header(hdr, 1);
+ ret = efi_systab_check_header(hdr);
if (ret) {
early_memunmap(p, size);
return ret;
@@ -474,6 +520,8 @@ void __init efi_init(void)
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi_clean_memmap();
+ efi_remove_e820_mmio();
+
if (efi_enabled(EFI_DBG))
efi_print_memmap();
}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index b36596bf0fc3..232acf418cfb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -389,10 +389,15 @@ static int __init efi_update_mappings(efi_memory_desc_t *md, unsigned long pf)
return err1 || err2;
}
-static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md)
+bool efi_disable_ibt_for_runtime __ro_after_init = true;
+
+static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *md,
+ bool has_ibt)
{
unsigned long pf = 0;
+ efi_disable_ibt_for_runtime |= !has_ibt;
+
if (md->attribute & EFI_MEMORY_XP)
pf |= _PAGE_NX;
@@ -414,6 +419,7 @@ void __init efi_runtime_update_mappings(void)
* exists, since it is intended to supersede EFI_PROPERTIES_TABLE.
*/
if (efi_enabled(EFI_MEM_ATTR)) {
+ efi_disable_ibt_for_runtime = false;
efi_memattr_apply_permissions(NULL, efi_update_mem_attr);
return;
}
diff --git a/arch/x86/platform/efi/fake_mem.c b/arch/x86/platform/efi/fake_mem.c
new file mode 100644
index 000000000000..41d57cad3d84
--- /dev/null
+++ b/arch/x86/platform/efi/fake_mem.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fake_mem.c
+ *
+ * Copyright (C) 2015 FUJITSU LIMITED
+ * Author: Taku Izumi <izumi.taku@jp.fujitsu.com>
+ *
+ * This code introduces new boot option named "efi_fake_mem"
+ * By specifying this parameter, you can add arbitrary attribute to
+ * specific memory range by updating original (firmware provided) EFI
+ * memmap.
+ */
+
+#include <linux/kernel.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/types.h>
+#include <linux/sort.h>
+#include <asm/e820/api.h>
+#include <asm/efi.h>
+
+#define EFI_MAX_FAKEMEM CONFIG_EFI_MAX_FAKE_MEM
+
+static struct efi_mem_range efi_fake_mems[EFI_MAX_FAKEMEM];
+static int nr_fake_mem;
+
+static int __init cmp_fake_mem(const void *x1, const void *x2)
+{
+ const struct efi_mem_range *m1 = x1;
+ const struct efi_mem_range *m2 = x2;
+
+ if (m1->range.start < m2->range.start)
+ return -1;
+ if (m1->range.start > m2->range.start)
+ return 1;
+ return 0;
+}
+
+static void __init efi_fake_range(struct efi_mem_range *efi_range)
+{
+ struct efi_memory_map_data data = { 0 };
+ int new_nr_map = efi.memmap.nr_map;
+ efi_memory_desc_t *md;
+ void *new_memmap;
+
+ /* count up the number of EFI memory descriptor */
+ for_each_efi_memory_desc(md)
+ new_nr_map += efi_memmap_split_count(md, &efi_range->range);
+
+ /* allocate memory for new EFI memmap */
+ if (efi_memmap_alloc(new_nr_map, &data) != 0)
+ return;
+
+ /* create new EFI memmap */
+ new_memmap = early_memremap(data.phys_map, data.size);
+ if (!new_memmap) {
+ __efi_memmap_free(data.phys_map, data.size, data.flags);
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new_memmap, efi_range);
+
+ /* swap into new EFI memmap */
+ early_memunmap(new_memmap, data.size);
+
+ efi_memmap_install(&data);
+}
+
+void __init efi_fake_memmap(void)
+{
+ int i;
+
+ if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+ return;
+
+ for (i = 0; i < nr_fake_mem; i++)
+ efi_fake_range(&efi_fake_mems[i]);
+
+ /* print new EFI memmap */
+ efi_print_memmap();
+}
+
+static int __init setup_fake_mem(char *p)
+{
+ u64 start = 0, mem_size = 0, attribute = 0;
+ int i;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p != '\0') {
+ mem_size = memparse(p, &p);
+ if (*p == '@')
+ start = memparse(p+1, &p);
+ else
+ break;
+
+ if (*p == ':')
+ attribute = simple_strtoull(p+1, &p, 0);
+ else
+ break;
+
+ if (nr_fake_mem >= EFI_MAX_FAKEMEM)
+ break;
+
+ efi_fake_mems[nr_fake_mem].range.start = start;
+ efi_fake_mems[nr_fake_mem].range.end = start + mem_size - 1;
+ efi_fake_mems[nr_fake_mem].attribute = attribute;
+ nr_fake_mem++;
+
+ if (*p == ',')
+ p++;
+ }
+
+ sort(efi_fake_mems, nr_fake_mem, sizeof(struct efi_mem_range),
+ cmp_fake_mem, NULL);
+
+ for (i = 0; i < nr_fake_mem; i++)
+ pr_info("efi_fake_mem: add attr=0x%016llx to [mem 0x%016llx-0x%016llx]",
+ efi_fake_mems[i].attribute, efi_fake_mems[i].range.start,
+ efi_fake_mems[i].range.end);
+
+ return *p == '\0' ? 0 : -EINVAL;
+}
+
+early_param("efi_fake_mem", setup_fake_mem);
+
+void __init efi_fake_memmap_early(void)
+{
+ int i;
+
+ /*
+ * The late efi_fake_mem() call can handle all requests if
+ * EFI_MEMORY_SP support is disabled.
+ */
+ if (!efi_soft_reserve_enabled())
+ return;
+
+ if (!efi_enabled(EFI_MEMMAP) || !nr_fake_mem)
+ return;
+
+ /*
+ * Given that efi_fake_memmap() needs to perform memblock
+ * allocations it needs to run after e820__memblock_setup().
+ * However, if efi_fake_mem specifies EFI_MEMORY_SP for a given
+ * address range that potentially needs to mark the memory as
+ * reserved prior to e820__memblock_setup(). Update e820
+ * directly if EFI_MEMORY_SP is specified for an
+ * EFI_CONVENTIONAL_MEMORY descriptor.
+ */
+ for (i = 0; i < nr_fake_mem; i++) {
+ struct efi_mem_range *mem = &efi_fake_mems[i];
+ efi_memory_desc_t *md;
+ u64 m_start, m_end;
+
+ if ((mem->attribute & EFI_MEMORY_SP) == 0)
+ continue;
+
+ m_start = mem->range.start;
+ m_end = mem->range.end;
+ for_each_efi_memory_desc(md) {
+ u64 start, end, size;
+
+ if (md->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ start = md->phys_addr;
+ end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ if (m_start <= end && m_end >= start)
+ /* fake range overlaps descriptor */;
+ else
+ continue;
+
+ /*
+ * Trim the boundary of the e820 update to the
+ * descriptor in case the fake range overlaps
+ * !EFI_CONVENTIONAL_MEMORY
+ */
+ start = max(start, m_start);
+ end = min(end, m_end);
+ size = end - start + 1;
+
+ if (end <= start)
+ continue;
+
+ /*
+ * Ensure each efi_fake_mem instance results in
+ * a unique e820 resource
+ */
+ e820__range_remove(start, size, E820_TYPE_RAM, 1);
+ e820__range_add(start, size, E820_TYPE_SOFT_RESERVED);
+ e820__update_table(e820_table);
+ }
+ }
+}
diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c
new file mode 100644
index 000000000000..c69f8471e6d0
--- /dev/null
+++ b/arch/x86/platform/efi/memmap.c
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common EFI memory map functions.
+ */
+
+#define pr_fmt(fmt) "efi: " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/efi.h>
+#include <linux/io.h>
+#include <asm/early_ioremap.h>
+#include <asm/efi.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+
+static phys_addr_t __init __efi_memmap_alloc_early(unsigned long size)
+{
+ return memblock_phys_alloc(size, SMP_CACHE_BYTES);
+}
+
+static phys_addr_t __init __efi_memmap_alloc_late(unsigned long size)
+{
+ unsigned int order = get_order(size);
+ struct page *p = alloc_pages(GFP_KERNEL, order);
+
+ if (!p)
+ return 0;
+
+ return PFN_PHYS(page_to_pfn(p));
+}
+
+void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags)
+{
+ if (flags & EFI_MEMMAP_MEMBLOCK) {
+ if (slab_is_available())
+ memblock_free_late(phys, size);
+ else
+ memblock_phys_free(phys, size);
+ } else if (flags & EFI_MEMMAP_SLAB) {
+ struct page *p = pfn_to_page(PHYS_PFN(phys));
+ unsigned int order = get_order(size);
+
+ free_pages((unsigned long) page_address(p), order);
+ }
+}
+
+/**
+ * efi_memmap_alloc - Allocate memory for the EFI memory map
+ * @num_entries: Number of entries in the allocated map.
+ * @data: efi memmap installation parameters
+ *
+ * Depending on whether mm_init() has already been invoked or not,
+ * either memblock or "normal" page allocation is used.
+ *
+ * Returns zero on success, a negative error code on failure.
+ */
+int __init efi_memmap_alloc(unsigned int num_entries,
+ struct efi_memory_map_data *data)
+{
+ /* Expect allocation parameters are zero initialized */
+ WARN_ON(data->phys_map || data->size);
+
+ data->size = num_entries * efi.memmap.desc_size;
+ data->desc_version = efi.memmap.desc_version;
+ data->desc_size = efi.memmap.desc_size;
+ data->flags &= ~(EFI_MEMMAP_SLAB | EFI_MEMMAP_MEMBLOCK);
+ data->flags |= efi.memmap.flags & EFI_MEMMAP_LATE;
+
+ if (slab_is_available()) {
+ data->flags |= EFI_MEMMAP_SLAB;
+ data->phys_map = __efi_memmap_alloc_late(data->size);
+ } else {
+ data->flags |= EFI_MEMMAP_MEMBLOCK;
+ data->phys_map = __efi_memmap_alloc_early(data->size);
+ }
+
+ if (!data->phys_map)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * efi_memmap_install - Install a new EFI memory map in efi.memmap
+ * @ctx: map allocation parameters (address, size, flags)
+ *
+ * Unlike efi_memmap_init_*(), this function does not allow the caller
+ * to switch from early to late mappings. It simply uses the existing
+ * mapping function and installs the new memmap.
+ *
+ * Returns zero on success, a negative error code on failure.
+ */
+int __init efi_memmap_install(struct efi_memory_map_data *data)
+{
+ efi_memmap_unmap();
+
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
+ return __efi_memmap_init(data);
+}
+
+/**
+ * efi_memmap_split_count - Count number of additional EFI memmap entries
+ * @md: EFI memory descriptor to split
+ * @range: Address range (start, end) to split around
+ *
+ * Returns the number of additional EFI memmap entries required to
+ * accommodate @range.
+ */
+int __init efi_memmap_split_count(efi_memory_desc_t *md, struct range *range)
+{
+ u64 m_start, m_end;
+ u64 start, end;
+ int count = 0;
+
+ start = md->phys_addr;
+ end = start + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ /* modifying range */
+ m_start = range->start;
+ m_end = range->end;
+
+ if (m_start <= start) {
+ /* split into 2 parts */
+ if (start < m_end && m_end < end)
+ count++;
+ }
+
+ if (start < m_start && m_start < end) {
+ /* split into 3 parts */
+ if (m_end < end)
+ count += 2;
+ /* split into 2 parts */
+ if (end <= m_end)
+ count++;
+ }
+
+ return count;
+}
+
+/**
+ * efi_memmap_insert - Insert a memory region in an EFI memmap
+ * @old_memmap: The existing EFI memory map structure
+ * @buf: Address of buffer to store new map
+ * @mem: Memory map entry to insert
+ *
+ * It is suggested that you call efi_memmap_split_count() first
+ * to see how large @buf needs to be.
+ */
+void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf,
+ struct efi_mem_range *mem)
+{
+ u64 m_start, m_end, m_attr;
+ efi_memory_desc_t *md;
+ u64 start, end;
+ void *old, *new;
+
+ /* modifying range */
+ m_start = mem->range.start;
+ m_end = mem->range.end;
+ m_attr = mem->attribute;
+
+ /*
+ * The EFI memory map deals with regions in EFI_PAGE_SIZE
+ * units. Ensure that the region described by 'mem' is aligned
+ * correctly.
+ */
+ if (!IS_ALIGNED(m_start, EFI_PAGE_SIZE) ||
+ !IS_ALIGNED(m_end + 1, EFI_PAGE_SIZE)) {
+ WARN_ON(1);
+ return;
+ }
+
+ for (old = old_memmap->map, new = buf;
+ old < old_memmap->map_end;
+ old += old_memmap->desc_size, new += old_memmap->desc_size) {
+
+ /* copy original EFI memory descriptor */
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ start = md->phys_addr;
+ end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1;
+
+ if (m_start <= start && end <= m_end)
+ md->attribute |= m_attr;
+
+ if (m_start <= start &&
+ (start < m_end && m_end < end)) {
+ /* first part */
+ md->attribute |= m_attr;
+ md->num_pages = (m_end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ /* latter part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_end + 1;
+ md->num_pages = (end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ }
+
+ if ((start < m_start && m_start < end) && m_end < end) {
+ /* first part */
+ md->num_pages = (m_start - md->phys_addr) >>
+ EFI_PAGE_SHIFT;
+ /* middle part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->attribute |= m_attr;
+ md->phys_addr = m_start;
+ md->num_pages = (m_end - m_start + 1) >>
+ EFI_PAGE_SHIFT;
+ /* last part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_end + 1;
+ md->num_pages = (end - m_end) >>
+ EFI_PAGE_SHIFT;
+ }
+
+ if ((start < m_start && m_start < end) &&
+ (end <= m_end)) {
+ /* first part */
+ md->num_pages = (m_start - md->phys_addr) >>
+ EFI_PAGE_SHIFT;
+ /* latter part */
+ new += old_memmap->desc_size;
+ memcpy(new, old, old_memmap->desc_size);
+ md = new;
+ md->phys_addr = m_start;
+ md->num_pages = (end - md->phys_addr + 1) >>
+ EFI_PAGE_SHIFT;
+ md->attribute |= m_attr;
+ }
+ }
+}
diff --git a/arch/x86/platform/efi/runtime-map.c b/arch/x86/platform/efi/runtime-map.c
new file mode 100644
index 000000000000..bbee682ef8cd
--- /dev/null
+++ b/arch/x86/platform/efi/runtime-map.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Red Hat, Inc., Dave Young <dyoung@redhat.com>
+ */
+
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+
+#include <asm/efi.h>
+#include <asm/setup.h>
+
+struct efi_runtime_map_entry {
+ efi_memory_desc_t md;
+ struct kobject kobj; /* kobject for each entry */
+};
+
+static struct efi_runtime_map_entry **map_entries;
+
+struct map_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct efi_runtime_map_entry *entry, char *buf);
+};
+
+static inline struct map_attribute *to_map_attr(struct attribute *attr)
+{
+ return container_of(attr, struct map_attribute, attr);
+}
+
+static ssize_t type_show(struct efi_runtime_map_entry *entry, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", entry->md.type);
+}
+
+#define EFI_RUNTIME_FIELD(var) entry->md.var
+
+#define EFI_RUNTIME_U64_ATTR_SHOW(name) \
+static ssize_t name##_show(struct efi_runtime_map_entry *entry, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, "0x%llx\n", EFI_RUNTIME_FIELD(name)); \
+}
+
+EFI_RUNTIME_U64_ATTR_SHOW(phys_addr);
+EFI_RUNTIME_U64_ATTR_SHOW(virt_addr);
+EFI_RUNTIME_U64_ATTR_SHOW(num_pages);
+EFI_RUNTIME_U64_ATTR_SHOW(attribute);
+
+static inline struct efi_runtime_map_entry *to_map_entry(struct kobject *kobj)
+{
+ return container_of(kobj, struct efi_runtime_map_entry, kobj);
+}
+
+static ssize_t map_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct efi_runtime_map_entry *entry = to_map_entry(kobj);
+ struct map_attribute *map_attr = to_map_attr(attr);
+
+ return map_attr->show(entry, buf);
+}
+
+static struct map_attribute map_type_attr = __ATTR_RO_MODE(type, 0400);
+static struct map_attribute map_phys_addr_attr = __ATTR_RO_MODE(phys_addr, 0400);
+static struct map_attribute map_virt_addr_attr = __ATTR_RO_MODE(virt_addr, 0400);
+static struct map_attribute map_num_pages_attr = __ATTR_RO_MODE(num_pages, 0400);
+static struct map_attribute map_attribute_attr = __ATTR_RO_MODE(attribute, 0400);
+
+/*
+ * These are default attributes that are added for every memmap entry.
+ */
+static struct attribute *def_attrs[] = {
+ &map_type_attr.attr,
+ &map_phys_addr_attr.attr,
+ &map_virt_addr_attr.attr,
+ &map_num_pages_attr.attr,
+ &map_attribute_attr.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(def);
+
+static const struct sysfs_ops map_attr_ops = {
+ .show = map_attr_show,
+};
+
+static void map_release(struct kobject *kobj)
+{
+ struct efi_runtime_map_entry *entry;
+
+ entry = to_map_entry(kobj);
+ kfree(entry);
+}
+
+static struct kobj_type __refdata map_ktype = {
+ .sysfs_ops = &map_attr_ops,
+ .default_groups = def_groups,
+ .release = map_release,
+};
+
+static struct kset *map_kset;
+
+static struct efi_runtime_map_entry *
+add_sysfs_runtime_map_entry(struct kobject *kobj, int nr,
+ efi_memory_desc_t *md)
+{
+ int ret;
+ struct efi_runtime_map_entry *entry;
+
+ if (!map_kset) {
+ map_kset = kset_create_and_add("runtime-map", NULL, kobj);
+ if (!map_kset)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ kset_unregister(map_kset);
+ map_kset = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(&entry->md, md, sizeof(efi_memory_desc_t));
+
+ kobject_init(&entry->kobj, &map_ktype);
+ entry->kobj.kset = map_kset;
+ ret = kobject_add(&entry->kobj, NULL, "%d", nr);
+ if (ret) {
+ kobject_put(&entry->kobj);
+ kset_unregister(map_kset);
+ map_kset = NULL;
+ return ERR_PTR(ret);
+ }
+
+ return entry;
+}
+
+int efi_get_runtime_map_size(void)
+{
+ return efi.memmap.nr_map * efi.memmap.desc_size;
+}
+
+int efi_get_runtime_map_desc_size(void)
+{
+ return efi.memmap.desc_size;
+}
+
+int efi_runtime_map_copy(void *buf, size_t bufsz)
+{
+ size_t sz = efi_get_runtime_map_size();
+
+ if (sz > bufsz)
+ sz = bufsz;
+
+ memcpy(buf, efi.memmap.map, sz);
+ return 0;
+}
+
+static int __init efi_runtime_map_init(void)
+{
+ int i, j, ret = 0;
+ struct efi_runtime_map_entry *entry;
+ efi_memory_desc_t *md;
+
+ if (!efi_enabled(EFI_MEMMAP) || !efi_kobj)
+ return 0;
+
+ map_entries = kcalloc(efi.memmap.nr_map, sizeof(entry), GFP_KERNEL);
+ if (!map_entries) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ i = 0;
+ for_each_efi_memory_desc(md) {
+ entry = add_sysfs_runtime_map_entry(efi_kobj, i, md);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto out_add_entry;
+ }
+ *(map_entries + i++) = entry;
+ }
+
+ return 0;
+out_add_entry:
+ for (j = i - 1; j >= 0; j--) {
+ entry = *(map_entries + j);
+ kobject_put(&entry->kobj);
+ }
+out:
+ return ret;
+}
+subsys_initcall_sync(efi_runtime_map_init);
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 994a229cb79f..68244a3422d1 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -183,13 +183,12 @@ err_sysfs:
return r;
}
-static int xo15_sci_remove(struct acpi_device *device)
+static void xo15_sci_remove(struct acpi_device *device)
{
acpi_disable_gpe(NULL, xo15_sci_gpe);
acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
cancel_work_sync(&sci_work);
sysfs_remove_file(&device->dev.kobj, &lid_wake_on_close_attr.attr);
- return 0;
}
#ifdef CONFIG_PM_SLEEP
diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c
index ed0442e35434..00a92cb2c814 100644
--- a/arch/x86/platform/pvh/enlighten.c
+++ b/arch/x86/platform/pvh/enlighten.c
@@ -86,7 +86,7 @@ static void __init init_pvh_bootparams(bool xen_guest)
}
/*
- * See Documentation/x86/boot.rst.
+ * See Documentation/arch/x86/boot.rst.
*
* Version 2.12 supports Xen entry point but we will use default x86/PC
* environment (i.e. hardware_subarch 0).
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 7fe564eaf228..c4365a05ab83 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,7 +50,7 @@
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
cld
lgdt (_pa(gdt))
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 1a536a187d74..ee21d6a36a80 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -166,10 +166,9 @@ static struct irq_domain *uv_get_irq_domain(void)
if (!fn)
goto out;
- uv_domain = irq_domain_create_tree(fn, &uv_domain_ops, NULL);
- if (uv_domain)
- uv_domain->parent = x86_vector_domain;
- else
+ uv_domain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0, fn,
+ &uv_domain_ops, NULL);
+ if (!uv_domain)
irq_domain_free_fwnode(fn);
out:
mutex_unlock(&uv_lock);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index bb176c72891c..7a4d5e911415 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -23,6 +23,7 @@
#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
+#include <asm/cacheinfo.h>
#include <asm/mmu_context.h>
#include <asm/cpu_device_id.h>
#include <asm/microcode.h>
@@ -261,7 +262,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
do_fpu_end();
tsc_verify_tsc_adjust(true);
x86_platform.restore_sched_clock_state();
- mtrr_bp_restore();
+ cache_bp_restore();
perf_restore_debug_store();
c = &cpu_data(smp_processor_id());
@@ -287,7 +288,7 @@ EXPORT_SYMBOL(restore_processor_state);
#endif
#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HOTPLUG_CPU)
-static void resume_play_dead(void)
+static void __noreturn resume_play_dead(void)
{
play_dead_common();
tboot_shutdown(TB_SHUTDOWN_WFS);
@@ -513,15 +514,23 @@ static int pm_cpu_check(const struct x86_cpu_id *c)
static void pm_save_spec_msr(void)
{
- u32 spec_msr_id[] = {
- MSR_IA32_SPEC_CTRL,
- MSR_IA32_TSX_CTRL,
- MSR_TSX_FORCE_ABORT,
- MSR_IA32_MCU_OPT_CTRL,
- MSR_AMD64_LS_CFG,
+ struct msr_enumeration {
+ u32 msr_no;
+ u32 feature;
+ } msr_enum[] = {
+ { MSR_IA32_SPEC_CTRL, X86_FEATURE_MSR_SPEC_CTRL },
+ { MSR_IA32_TSX_CTRL, X86_FEATURE_MSR_TSX_CTRL },
+ { MSR_TSX_FORCE_ABORT, X86_FEATURE_TSX_FORCE_ABORT },
+ { MSR_IA32_MCU_OPT_CTRL, X86_FEATURE_SRBDS_CTRL },
+ { MSR_AMD64_LS_CFG, X86_FEATURE_LS_CFG_SSBD },
+ { MSR_AMD64_DE_CFG, X86_FEATURE_LFENCE_RDTSC },
};
+ int i;
- msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id));
+ for (i = 0; i < ARRAY_SIZE(msr_enum); i++) {
+ if (boot_cpu_has(msr_enum[i].feature))
+ msr_build_context(&msr_enum[i].msr_no, 1);
+ }
}
static int pm_check_save_msr(void)
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index e94e0050a583..6f955eb1e163 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -159,7 +159,7 @@ int relocate_restore_code(void)
if (!relocated_restore_code)
return -ENOMEM;
- memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
+ __memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE);
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3_pa()) +
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 17f09dc26381..42abd6af1198 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -14,6 +14,11 @@ $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
CFLAGS_sha256.o := -D__DISABLE_EXPORTS
+# When profile-guided optimization is enabled, llvm emits two different
+# overlapping text sections, which is not supported by kexec. Remove profile
+# optimization flags.
+KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%,$(KBUILD_CFLAGS))
+
# When linking purgatory.ro with -r unresolved symbols are not checked,
# also link a purgatory.chk binary without -r to check for unresolved symbols.
PURGATORY_LDFLAGS := -e purgatory_start -z nodefaultlib
@@ -69,8 +74,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS)
CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_string.o += $(PURGATORY_CFLAGS)
-AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2
-AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2
+asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x))
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 41d7669a97ad..af565816d2ba 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -200,14 +200,18 @@ static void __init set_real_mode_permissions(void)
set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
}
-static int __init init_real_mode(void)
+void __init init_real_mode(void)
{
if (!real_mode_header)
panic("Real mode trampoline was not allocated");
setup_real_mode();
set_real_mode_permissions();
+}
+static int __init do_init_real_mode(void)
+{
+ x86_platform.realmode_init();
return 0;
}
-early_initcall(init_real_mode);
+early_initcall(do_init_real_mode);
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index bddfc9a46645..90e820ac9771 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
PHONY += posttest
-ifeq ($(KBUILD_VERBOSE),1)
+ifneq ($(findstring 1, $(KBUILD_VERBOSE)),)
posttest_verbose = -v
else
posttest_verbose =
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 2925074b9a58..d30949e25ebd 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -406,7 +406,7 @@ static void read_ehdr(FILE *fp)
if (ehdr.e_version != EV_CURRENT)
die("Unknown ELF version\n");
if (ehdr.e_ehsize != sizeof(Elf_Ehdr))
- die("Bad Elf header size\n");
+ die("Bad ELF header size\n");
if (ehdr.e_phentsize != sizeof(Elf_Phdr))
die("Bad program header entry\n");
if (ehdr.e_shentsize != sizeof(Elf_Shdr))
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 3d5cd2e57820..ee89f6bb9242 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -48,4 +48,4 @@ include/generated/user_constants.h: $(obj)/user-offsets.s FORCE
UNPROFILE_OBJS := stub_segv.o
CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index dcaf3b38a9e0..6523eb7c3bd1 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -201,10 +201,6 @@ typedef struct user_i387_struct elf_fpregset_t;
struct task_struct;
-extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
-
-#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu)
-
#define ELF_EXEC_PAGESIZE 4096
#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c
index 48a3eb09d951..650cdbbdaf45 100644
--- a/arch/x86/um/elfcore.c
+++ b/arch/x86/um/elfcore.c
@@ -7,7 +7,7 @@
#include <asm/elf.h>
-Elf32_Half elf_core_extra_phdrs(void)
+Elf32_Half elf_core_extra_phdrs(struct coredump_params *cprm)
{
return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0;
}
@@ -60,7 +60,7 @@ int elf_core_write_extra_data(struct coredump_params *cprm)
return 1;
}
-size_t elf_core_extra_data_size(void)
+size_t elf_core_extra_data_size(struct coredump_params *cprm)
{
if ( vsyscall_ehdr ) {
const struct elfhdr *const ehdrp =
diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c
index cafd01f730da..29b2203bc82c 100644
--- a/arch/x86/um/mem_32.c
+++ b/arch/x86/um/mem_32.c
@@ -16,7 +16,7 @@ static int __init gate_vma_init(void)
vma_init(&gate_vma, NULL);
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
- gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
+ vm_flags_init(&gate_vma, VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC);
gate_vma.vm_page_prot = PAGE_READONLY;
return 0;
diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile
index 253bfb8cb702..ae169125d03f 100644
--- a/arch/x86/um/os-Linux/Makefile
+++ b/arch/x86/um/os-Linux/Makefile
@@ -10,4 +10,4 @@ obj-$(CONFIG_64BIT) += prctl.o
USER_OBJS := $(obj-y)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 4c6c2be0c899..38fa894b65d0 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -89,19 +89,19 @@ static inline void remap_stack_and_trap(void)
"addl %4,%%ebx ; movl %%eax, (%%ebx) ;"
"int $3"
: :
- "g" (~(UM_KERN_PAGE_SIZE - 1)),
+ "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)),
"g" (STUB_MMAP_NR),
"g" (UML_STUB_FIELD_FD),
"g" (UML_STUB_FIELD_OFFSET),
"g" (UML_STUB_FIELD_CHILD_ERR),
- "c" (UM_KERN_PAGE_SIZE),
+ "c" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE),
"d" (PROT_READ | PROT_WRITE),
"S" (MAP_FIXED | MAP_SHARED)
:
"memory");
}
-static __always_inline void *get_stub_page(void)
+static __always_inline void *get_stub_data(void)
{
unsigned long ret;
@@ -109,7 +109,7 @@ static __always_inline void *get_stub_page(void)
"movl %%esp,%0 ;"
"andl %1,%0"
: "=a" (ret)
- : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+ : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)));
return (void *)ret;
}
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 92ea1670cf1c..2de1c8f88173 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -98,18 +98,18 @@ static inline void remap_stack_and_trap(void)
"int3"
: :
"g" (STUB_MMAP_NR),
- "g" (~(UM_KERN_PAGE_SIZE - 1)),
+ "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)),
"g" (MAP_FIXED | MAP_SHARED),
"g" (UML_STUB_FIELD_FD),
"g" (UML_STUB_FIELD_OFFSET),
"g" (UML_STUB_FIELD_CHILD_ERR),
- "S" (UM_KERN_PAGE_SIZE),
+ "S" (STUB_DATA_PAGES * UM_KERN_PAGE_SIZE),
"d" (PROT_READ | PROT_WRITE)
:
__syscall_clobber, "r10", "r8", "r9");
}
-static __always_inline void *get_stub_page(void)
+static __always_inline void *get_stub_data(void)
{
unsigned long ret;
@@ -117,7 +117,7 @@ static __always_inline void *get_stub_page(void)
"movq %%rsp,%0 ;"
"andq %1,%0"
: "=a" (ret)
- : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+ : "g" (~(STUB_DATA_PAGES * UM_KERN_PAGE_SIZE - 1)));
return (void *)ret;
}
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index f7eefba034f9..040668b989b5 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -11,7 +11,7 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
- struct faultinfo *f = get_stub_page();
+ struct faultinfo *f = get_stub_data();
ucontext_t *uc = p;
GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext);
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 6fbe97c52c99..6825e146a62f 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -61,7 +61,7 @@ CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage
#
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
- $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ $(CC_FLAGS_LTO) $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index 2112b8d14668..ff0f3b4b6c45 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -17,8 +17,10 @@ int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
+ asm("syscall"
+ : "=a" (ret)
+ : "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
+ : "rcx", "r11", "memory");
return ret;
}
@@ -29,8 +31,10 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+ asm("syscall"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
+ : "rcx", "r11", "memory");
return ret;
}
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3c5b52fbe4a7..a9ec8c9f5c5d 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_PV_DOM0) += vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index f82857e48815..093b78c8bbec 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -23,6 +23,7 @@
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
+#include <linux/kstrtox.h>
#include <linux/memblock.h>
#include <linux/export.h>
#include <linux/mm.h>
@@ -32,6 +33,7 @@
#include <linux/edd.h>
#include <linux/reboot.h>
#include <linux/virtio_anchor.h>
+#include <linux/stackprotector.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -64,7 +66,6 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
@@ -113,7 +114,7 @@ static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
static int __init parse_xen_msr_safe(char *str)
{
if (str)
- return strtobool(str, &xen_msr_safe);
+ return kstrtobool(str, &xen_msr_safe);
return -EINVAL;
}
early_param("xen_msr_safe", parse_xen_msr_safe);
@@ -275,6 +276,7 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_LKGS);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
@@ -1067,7 +1069,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
.write_cr4 = xen_write_cr4,
- .wbinvd = native_wbinvd,
+ .wbinvd = pv_native_wbinvd,
.read_msr = xen_read_msr,
.write_msr = xen_write_msr,
@@ -1209,7 +1211,7 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
pv_ops.cpu.load_gdt = xen_load_gdt;
@@ -1265,6 +1267,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_vcpu_info_reset(0);
x86_platform.get_nmi_reason = xen_get_nmi_reason;
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_select = x86_init_noop;
@@ -1386,7 +1390,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
- xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_init_vga(info, xen_start_info->console.dom0.info_size,
+ &boot_params.screen_info);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index bcae606bbc5c..ada3868c02c2 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -43,6 +43,19 @@ void __init xen_pvh_init(struct boot_params *boot_params)
x86_init.oem.banner = xen_banner;
xen_efi_init(boot_params);
+
+ if (xen_initial_domain()) {
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_dom0_console,
+ };
+ int ret = HYPERVISOR_platform_op(&op);
+
+ if (ret > 0)
+ xen_init_vga(&op.u.dom0_console,
+ min(ret * sizeof(char),
+ sizeof(op.u.dom0_console)),
+ &boot_params->screen_info);
+ }
}
void __init mem_map_via_hcall(struct boot_params *boot_params_p)
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 06c3c2fb4b06..6092fea7d651 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void xen_safe_halt(void)
+static noinstr void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ee29fb558f2e..b3b8d289b9ab 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -885,14 +885,7 @@ void xen_mm_unpin_all(void)
spin_unlock(&pgd_lock);
}
-static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- spin_lock(&next->page_table_lock);
- xen_pgd_pin(next);
- spin_unlock(&next->page_table_lock);
-}
-
-static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_enter_mmap(struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -2153,8 +2146,7 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
+ .enter_mmap = xen_enter_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 58db86f7b384..9bdc3b656b2c 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -134,11 +134,6 @@ static inline unsigned p2m_mid_index(unsigned long pfn)
return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 4f4309500559..c2be3efb2ba0 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/sched.h>
+#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
@@ -85,7 +86,7 @@ static void __init xen_parse_512gb(void)
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
- else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
@@ -933,12 +934,8 @@ void xen_enable_syscall(void)
static void __init xen_pvmmu_arch_setup(void)
{
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
-
if (register_callback(CALLBACKTYPE_event,
xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c3e1f9a7d43a..4b0d6fff88de 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
void xen_smp_intr_free(unsigned int cpu)
{
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
per_cpu(xen_resched_irq, cpu).irq = -1;
- kfree(per_cpu(xen_resched_irq, cpu).name);
- per_cpu(xen_resched_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
per_cpu(xen_callfunc_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfunc_irq, cpu).name);
- per_cpu(xen_callfunc_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
per_cpu(xen_debug_irq, cpu).irq = -1;
- kfree(per_cpu(xen_debug_irq, cpu).name);
- per_cpu(xen_debug_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
NULL);
per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
- per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
}
}
@@ -65,6 +65,7 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
@@ -74,9 +75,9 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu).irq = rc;
- per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
@@ -86,10 +87,10 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu).irq = rc;
- per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
IRQF_PERCPU | IRQF_NOBALANCING,
@@ -97,10 +98,10 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu).irq = rc;
- per_cpu(xen_debug_irq, cpu).name = debug_name;
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
@@ -110,7 +111,6 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
- per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
return 0;
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index bd02f9d50107..22fb982ff971 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,6 +21,8 @@ void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(const struct cpumask *mask);
void xen_smp_send_call_function_single_ipi(int cpu);
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
struct xen_common_irq {
int irq;
char *name;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 480be82e9b7b..a9cf8c8fa074 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -97,18 +97,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
void xen_smp_intr_free_pv(unsigned int cpu)
{
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
if (per_cpu(xen_irq_work, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
per_cpu(xen_irq_work, cpu).irq = -1;
- kfree(per_cpu(xen_irq_work, cpu).name);
- per_cpu(xen_irq_work, cpu).name = NULL;
}
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
- kfree(per_cpu(xen_pmu_irq, cpu).name);
- per_cpu(xen_pmu_irq, cpu).name = NULL;
}
}
@@ -118,6 +118,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
char *callfunc_name, *pmu_name;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
@@ -127,10 +128,10 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_irq_work, cpu).irq = rc;
- per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
@@ -138,7 +139,6 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
- per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0;
@@ -381,21 +381,12 @@ static void xen_pv_cpu_die(unsigned int cpu)
}
}
-static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
- cpu_bringup();
- /*
- * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
- * clears certain data that the cpu_idle loop (which called us
- * and that we return from) expects. The only way to get that
- * data back is to call:
- */
- tick_nohz_idle_enter();
- tick_nohz_idle_stop_tick_protected();
-
- cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
+ xen_cpu_bringup_again((unsigned long)task_pt_regs(current));
+ BUG();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -409,7 +400,7 @@ static void xen_pv_cpu_die(unsigned int cpu)
BUG();
}
-static void xen_pv_play_dead(void)
+static void __noreturn xen_pv_play_dead(void)
{
BUG();
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 043c73dfd2c9..5c6fc16e4b92 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu)
cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ per_cpu(irq_name, cpu) = name;
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
@@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu)
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
- per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu)
if (!xen_pvspin)
return;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
/*
* When booting the kernel with 'mitigations=auto,nosmt', the secondary
* CPUs are not activated, and lock_kicker_irq is not initialized.
@@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu)
unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
- kfree(per_cpu(irq_name, cpu));
- per_cpu(irq_name, cpu) = NULL;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 9ef0a5cca96e..b74ac2562cfb 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <asm/xen/cpuid.h>
#include <xen/events.h>
#include <xen/features.h>
@@ -60,9 +61,17 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
-static u64 xen_sched_clock(void)
+static noinstr u64 xen_sched_clock(void)
{
- return xen_clocksource_read() - xen_sched_clock_offset;
+ struct pvclock_vcpu_time_info *src;
+ u64 ret;
+
+ preempt_disable_notrace();
+ src = &__this_cpu_read(xen_vcpu)->time;
+ ret = pvclock_clocksource_read_nowd(src);
+ ret -= xen_sched_clock_offset;
+ preempt_enable_notrace();
+ return ret;
}
static void xen_read_wallclock(struct timespec64 *ts)
@@ -474,15 +483,47 @@ static void xen_setup_vsyscall_time_info(void)
xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
+/*
+ * Check if it is possible to safely use the tsc as a clocksource. This is
+ * only true if the hypervisor notifies the guest that its tsc is invariant,
+ * the tsc is stable, and the tsc instruction will never be emulated.
+ */
+static int __init xen_tsc_safe_clocksource(void)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
+ return 0;
+
+ if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
+ return 0;
+
+ if (check_tsc_unstable())
+ return 0;
+
+ /* Leaf 4, sub-leaf 0 (0x40000x03) */
+ cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
+
+ return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
+}
+
static void __init xen_time_init(void)
{
struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec64 tp;
- /* As Dom0 is never moved, no penalty on using TSC there */
+ /*
+ * As Dom0 is never moved, no penalty on using TSC there.
+ *
+ * If it is possible for the guest to determine that the tsc is a safe
+ * clocksource, then set xen_clocksource rating below that of the tsc
+ * so that the system prefers tsc instead.
+ */
if (xen_initial_domain())
xen_clocksource.rating = 275;
+ else if (xen_tsc_safe_clocksource())
+ xen_clocksource.rating = 299;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index 14ea32e734d5..d97adab8420f 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -9,10 +9,9 @@
#include "xen-ops.h"
-void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size,
+ struct screen_info *screen_info)
{
- struct screen_info *screen_info = &boot_params.screen_info;
-
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b4fdf6b9542..08f1ceb9eb81 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ENDBR
pop %rcx
pop %r11
@@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* rsp->rax }
*/
SYM_CODE_START(xen_iret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
pushq $0
jmp hypercall_iret
@@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
@@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..643d02900fbb 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,11 +45,11 @@ SYM_CODE_END(hypercall_page)
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
cld
- mov initial_stack(%rip), %rsp
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
/* Set up %gs.
*
@@ -71,11 +71,18 @@ SYM_CODE_END(startup_xen)
#ifdef CONFIG_XEN_PV_SMP
.pushsection .text
SYM_CODE_START(asm_cpu_bringup_and_idle)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
call cpu_bringup_and_idle
SYM_CODE_END(asm_cpu_bringup_and_idle)
+
+SYM_CODE_START(xen_cpu_bringup_again)
+ UNWIND_HINT_FUNC
+ mov %rdi, %rsp
+ UNWIND_HINT_REGS
+ call cpu_bringup_and_idle
+SYM_CODE_END(xen_cpu_bringup_again)
.popsection
#endif
#endif
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a8bb972193d..a10903785a33 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -108,11 +108,12 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_PV_DOM0
-void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size,
+ struct screen_info *);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
- size_t size)
+ size_t size, struct screen_info *si)
{
}
#endif