diff options
Diffstat (limited to 'arch/x86/entry')
-rw-r--r-- | arch/x86/entry/calling.h | 61 | ||||
-rw-r--r-- | arch/x86/entry/common.c | 13 | ||||
-rw-r--r-- | arch/x86/entry/entry_32.S | 339 | ||||
-rw-r--r-- | arch/x86/entry/entry_64.S | 170 | ||||
-rw-r--r-- | arch/x86/entry/syscall_32.c | 8 | ||||
-rw-r--r-- | arch/x86/entry/syscall_64.c | 37 | ||||
-rw-r--r-- | arch/x86/entry/syscalls/syscall_32.tbl | 12 | ||||
-rw-r--r-- | arch/x86/entry/syscalls/syscalltbl.sh | 35 | ||||
-rw-r--r-- | arch/x86/entry/vdso/vdso32-setup.c | 1 |
9 files changed, 427 insertions, 249 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index d6f2e29be3e2..b3f121478738 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -98,13 +98,6 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 .macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0 - /* - * Push registers and sanitize registers of values that a - * speculation attack might otherwise want to exploit. The - * lower registers are likely clobbered well before they - * could be put to use in a speculative execution gadget. - * Interleave XOR with PUSH for better uop scheduling: - */ .if \save_ret pushq %rsi /* pt_regs->si */ movq 8(%rsp), %rsi /* temporarily store the return address in %rsi */ @@ -114,34 +107,43 @@ For 32-bit we have the following conventions - kernel is built with pushq %rsi /* pt_regs->si */ .endif pushq \rdx /* pt_regs->dx */ - xorl %edx, %edx /* nospec dx */ pushq %rcx /* pt_regs->cx */ - xorl %ecx, %ecx /* nospec cx */ pushq \rax /* pt_regs->ax */ pushq %r8 /* pt_regs->r8 */ - xorl %r8d, %r8d /* nospec r8 */ pushq %r9 /* pt_regs->r9 */ - xorl %r9d, %r9d /* nospec r9 */ pushq %r10 /* pt_regs->r10 */ - xorl %r10d, %r10d /* nospec r10 */ pushq %r11 /* pt_regs->r11 */ - xorl %r11d, %r11d /* nospec r11*/ pushq %rbx /* pt_regs->rbx */ - xorl %ebx, %ebx /* nospec rbx*/ pushq %rbp /* pt_regs->rbp */ - xorl %ebp, %ebp /* nospec rbp*/ pushq %r12 /* pt_regs->r12 */ - xorl %r12d, %r12d /* nospec r12*/ pushq %r13 /* pt_regs->r13 */ - xorl %r13d, %r13d /* nospec r13*/ pushq %r14 /* pt_regs->r14 */ - xorl %r14d, %r14d /* nospec r14*/ pushq %r15 /* pt_regs->r15 */ - xorl %r15d, %r15d /* nospec r15*/ UNWIND_HINT_REGS + .if \save_ret pushq %rsi /* return address on top of stack */ .endif + + /* + * Sanitize registers of values that a speculation attack might + * otherwise want to exploit. The lower registers are likely clobbered + * well before they could be put to use in a speculative execution + * gadget. + */ + xorl %edx, %edx /* nospec dx */ + xorl %ecx, %ecx /* nospec cx */ + xorl %r8d, %r8d /* nospec r8 */ + xorl %r9d, %r9d /* nospec r9 */ + xorl %r10d, %r10d /* nospec r10 */ + xorl %r11d, %r11d /* nospec r11 */ + xorl %ebx, %ebx /* nospec rbx */ + xorl %ebp, %ebp /* nospec rbp */ + xorl %r12d, %r12d /* nospec r12 */ + xorl %r13d, %r13d /* nospec r13 */ + xorl %r14d, %r14d /* nospec r14 */ + xorl %r15d, %r15d /* nospec r15 */ + .endm .macro POP_REGS pop_rdi=1 skip_r11rcx=0 @@ -172,21 +174,6 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts - * the original rbp. - */ -.macro ENCODE_FRAME_POINTER ptregs_offset=0 -#ifdef CONFIG_FRAME_POINTER - leaq 1+\ptregs_offset(%rsp), %rbp -#endif -.endm - #ifdef CONFIG_PAGE_TABLE_ISOLATION /* @@ -375,3 +362,9 @@ For 32-bit we have the following conventions - kernel is built with .Lafter_call_\@: #endif .endm + +#ifdef CONFIG_PARAVIRT_XXL +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg +#else +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg +#endif diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2418804e66b4..73c216d73417 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -290,15 +290,16 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); - /* - * NB: Native and x32 syscalls are dispatched from the same - * table. The only functional difference is the x32 bit in - * regs->orig_ax, which changes the behavior of some syscalls. - */ - nr &= __SYSCALL_MASK; if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); +#ifdef CONFIG_X86_X32_ABI + } else if (likely((nr & __X32_SYSCALL_BIT) && + (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) { + nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT, + X32_NR_syscalls); + regs->ax = x32_sys_call_table[nr](regs); +#endif } syscall_return_slowpath(regs); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f49e11669271..e5530e74b738 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -67,7 +67,6 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -173,7 +172,7 @@ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI .if \no_user_check == 0 /* coming from usermode? */ - testl $SEGMENT_RPL_MASK, PT_CS(%esp) + testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp) jz .Lend_\@ .endif /* On user-cr3? */ @@ -203,10 +202,126 @@ .Lend_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) +#define CS_FROM_KERNEL (1 << 29) +#define CS_FROM_ESPFIX (1 << 28) + +.macro FIXUP_FRAME + /* + * The high bits of the CS dword (__csh) are used for CS_FROM_*. + * Clear them in case hardware didn't do this for us. + */ + andl $0x0000ffff, 4*4(%esp) + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, 5*4(%esp) + jnz .Lfrom_usermode_no_fixup_\@ +#endif + testl $USER_SEGMENT_RPL_MASK, 4*4(%esp) + jnz .Lfrom_usermode_no_fixup_\@ + + orl $CS_FROM_KERNEL, 4*4(%esp) + + /* + * When we're here from kernel mode; the (exception) stack looks like: + * + * 6*4(%esp) - <previous context> + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs + * + * Lets build a 5 entry IRET frame after that, such that struct pt_regs + * is complete and in particular regs->sp is correct. This gives us + * the original 6 enties as gap: + * + * 14*4(%esp) - <previous context> + * 13*4(%esp) - gap / flags + * 12*4(%esp) - gap / cs + * 11*4(%esp) - gap / ip + * 10*4(%esp) - gap / orig_eax + * 9*4(%esp) - gap / gs / function + * 8*4(%esp) - gap / fs + * 7*4(%esp) - ss + * 6*4(%esp) - sp + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs + */ + + pushl %ss # ss + pushl %esp # sp (points at ss) + addl $7*4, (%esp) # point sp back at the previous context + pushl 7*4(%esp) # flags + pushl 7*4(%esp) # cs + pushl 7*4(%esp) # ip + pushl 7*4(%esp) # orig_eax + pushl 7*4(%esp) # gs / function + pushl 7*4(%esp) # fs +.Lfrom_usermode_no_fixup_\@: +.endm + +.macro IRET_FRAME + /* + * We're called with %ds, %es, %fs, and %gs from the interrupted + * frame, so we shouldn't use them. Also, we may be in ESPFIX + * mode and therefore have a nonzero SS base and an offset ESP, + * so any attempt to access the stack needs to use SS. (except for + * accesses through %esp, which automatically use SS.) + */ + testl $CS_FROM_KERNEL, 1*4(%esp) + jz .Lfinished_frame_\@ + + /* + * Reconstruct the 3 entry IRET frame right after the (modified) + * regs->sp without lowering %esp in between, such that an NMI in the + * middle doesn't scribble our stack. + */ + pushl %eax + pushl %ecx + movl 5*4(%esp), %eax # (modified) regs->sp + + movl 4*4(%esp), %ecx # flags + movl %ecx, %ss:-1*4(%eax) + + movl 3*4(%esp), %ecx # cs + andl $0x0000ffff, %ecx + movl %ecx, %ss:-2*4(%eax) + + movl 2*4(%esp), %ecx # ip + movl %ecx, %ss:-3*4(%eax) + + movl 1*4(%esp), %ecx # eax + movl %ecx, %ss:-4*4(%eax) + + popl %ecx + lea -4*4(%eax), %esp + popl %eax +.Lfinished_frame_\@: +.endm + +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0 cld +.if \skip_gs == 0 PUSH_GS +.endif pushl %fs + + pushl %eax + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs +.if \unwind_espfix > 0 + UNWIND_ESPFIX_STACK +.endif + popl %eax + + FIXUP_FRAME pushl %es pushl %ds pushl \pt_regs_ax @@ -219,19 +334,17 @@ movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs +.if \skip_gs == 0 SET_KERNEL_GS %edx - +.endif /* Switch to kernel stack if necessary */ .if \switch_stacks > 0 SWITCH_TO_KERNEL_STACK .endif - .endm -.macro SAVE_ALL_NMI cr3_reg:req - SAVE_ALL +.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0 + SAVE_ALL unwind_espfix=\unwind_espfix BUG_IF_WRONG_CR3 @@ -247,22 +360,6 @@ .Lend_\@: .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just clearing the MSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the - * original rbp. - */ -.macro ENCODE_FRAME_POINTER -#ifdef CONFIG_FRAME_POINTER - mov %esp, %ebp - andl $0x7fffffff, %ebp -#endif -.endm - .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -279,6 +376,7 @@ 2: popl %es 3: popl %fs POP_GS \pop + IRET_FRAME .pushsection .fixup, "ax" 4: movl $0, (%esp) jmp 1b @@ -317,7 +415,8 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -375,9 +474,6 @@ * switch to it before we do any copying. */ -#define CS_FROM_ENTRY_STACK (1 << 31) -#define CS_FROM_USER_CR3 (1 << 30) - .macro SWITCH_TO_KERNEL_STACK ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV @@ -391,13 +487,6 @@ * that register for the time this macro runs */ - /* - * The high bits of the CS dword (__csh) are used for - * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case - * hardware didn't do this for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -755,7 +844,7 @@ ret_from_intr: andl $SEGMENT_RPL_MASK, %eax #endif cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + jb restore_all_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) @@ -765,18 +854,6 @@ ENTRY(resume_userspace) jmp restore_all END(ret_from_exception) -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all_kernel - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all_kernel - call preempt_schedule_irq - jmp restore_all_kernel -END(resume_kernel) -#endif - GLOBAL(__begin_SYSENTER_singlestep_region) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1027,6 +1104,15 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: +#ifdef CONFIG_PREEMPT + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz .Lno_preempt + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz .Lno_preempt + call preempt_schedule_irq +.Lno_preempt: +#endif TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE BUG_IF_WRONG_CR3 @@ -1062,30 +1148,43 @@ ENDPROC(entry_INT80_32) * We can't call C functions using the ESPFIX stack. This code reads * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. + * + * We might be on user CR3 here, so percpu data is not mapped and we can't + * access the GDT through the percpu segment. Instead, use SGDT to find + * the cpu_entry_area alias of the GDT. */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + pushl %ecx + subl $2*4, %esp + sgdt (%esp) + movl 2(%esp), %ecx /* GDT address */ + /* + * Careful: ECX is a linear pointer, so we need to force base + * zero. %cs is the only known-linear segment we have right now. + */ + mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */ + mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */ shl $16, %eax + addl $2*4, %esp + popl %ecx addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm + .macro UNWIND_ESPFIX_STACK + /* It's safe to clobber %eax, all other regs need to be preserved */ #ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + jne .Lno_fixup_\@ /* switch to normal stack */ FIXUP_ESPFIX_STACK -27: +.Lno_fixup_\@: #endif .endm @@ -1275,11 +1374,6 @@ END(spurious_interrupt_bug) #ifdef CONFIG_XEN_PV ENTRY(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1287,16 +1381,17 @@ ENTRY(xen_hypervisor_callback) * iret instruction's behaviour where it delivers a * pending interrupt when enabling interrupts: */ - movl PT_EIP(%esp), %eax - cmpl $xen_iret_start_crit, %eax + cmpl $xen_iret_start_crit, (%esp) jb 1f - cmpl $xen_iret_end_crit, %eax + cmpl $xen_iret_end_crit, (%esp) jae 1f - - jmp xen_iret_crit_fixup - -ENTRY(xen_do_upcall) -1: mov %esp, %eax + call xen_iret_crit_fixup +1: + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + mov %esp, %eax call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall @@ -1378,37 +1473,48 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC pushl $do_page_fault - ALIGN - jmp common_exception + jmp common_exception_read_cr2 END(page_fault) -common_exception: +common_exception_read_cr2: /* the function address is in %gs's slot on the stack */ - pushl %fs - pushl %es - pushl %ds - pushl %eax - movl $(__USER_DS), %eax - movl %eax, %ds - movl %eax, %es - movl $(__KERNEL_PERCPU), %eax - movl %eax, %fs - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - SWITCH_TO_KERNEL_STACK + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 + ENCODE_FRAME_POINTER - cld - UNWIND_ESPFIX_STACK + + /* fixup %gs */ GS_TO_REG %ecx - movl PT_GS(%esp), %edi # get the function address + movl PT_GS(%esp), %edi + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + + GET_CR2_INTO(%ecx) # might clobber %eax + + /* fixup orig %eax */ movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + + TRACE_IRQS_OFF + movl %esp, %eax # pt_regs pointer + CALL_NOSPEC %edi + jmp ret_from_exception +END(common_exception_read_cr2) + +common_exception: + /* the function address is in %gs's slot on the stack */ + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 + ENCODE_FRAME_POINTER + + /* fixup %gs */ + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address REG_TO_PTGS %ecx SET_KERNEL_GS %ecx + + /* fixup orig %eax */ + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer CALL_NOSPEC %edi @@ -1436,6 +1542,10 @@ ENTRY(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 + /* + * ESPFIX_SS is only ever set on the return to user path + * after we've switched to the entry stack. + */ pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1471,6 +1581,11 @@ ENTRY(nmi) movl %ebx, %esp .Lnmi_return: +#ifdef CONFIG_X86_ESPFIX32 + testl $CS_FROM_ESPFIX, PT_CS(%esp) + jnz .Lnmi_from_espfix +#endif + CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 jmp .Lirq_return @@ -1478,23 +1593,42 @@ ENTRY(nmi) #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: /* - * create the pointer to lss back + * Create the pointer to LSS back */ pushl %ss pushl %esp addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL_NMI cr3_reg=%edi + + /* Copy the (short) IRET frame */ + pushl 4*4(%esp) # flags + pushl 4*4(%esp) # cs + pushl 4*4(%esp) # ip + + pushl %eax # orig_ax + + SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1 ENCODE_FRAME_POINTER - FIXUP_ESPFIX_STACK # %eax == %esp + + /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */ + xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp) + xorl %edx, %edx # zero error code - call do_nmi + movl %esp, %eax # pt_regs pointer + jmp .Lnmi_from_sysenter_stack + +.Lnmi_from_espfix: RESTORE_ALL_NMI cr3_reg=%edi - lss 12+4(%esp), %esp # back to espfix stack + /* + * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to + * fix up the gap and long frame: + * + * 3 - original frame (exception) + * 2 - ESPFIX block (above) + * 6 - gap (FIXUP_FRAME) + * 5 - long frame (FIXUP_FRAME) + * 1 - orig_ax + */ + lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif END(nmi) @@ -1513,6 +1647,7 @@ ENTRY(int3) END(int3) ENTRY(general_protection) + ASM_CLAC pushl $do_general_protection jmp common_exception END(general_protection) @@ -1521,7 +1656,7 @@ END(general_protection) ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp common_exception + jmp common_exception_read_cr2 END(async_page_fault) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 69808aaa6851..b83feb7d9464 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -249,7 +249,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ - UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* @@ -258,6 +257,7 @@ syscall_return_via_sysret: */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ @@ -512,7 +512,7 @@ END(spurious_entries_start) * +----------------------------------------------------+ */ ENTRY(interrupt_entry) - UNWIND_HINT_FUNC + UNWIND_HINT_IRET_REGS offset=16 ASM_CLAC cld @@ -544,9 +544,9 @@ ENTRY(interrupt_entry) pushq 5*8(%rdi) /* regs->eflags */ pushq 4*8(%rdi) /* regs->cs */ pushq 3*8(%rdi) /* regs->ip */ + UNWIND_HINT_IRET_REGS pushq 2*8(%rdi) /* regs->orig_ax */ pushq 8(%rdi) /* return address */ - UNWIND_HINT_FUNC movq (%rdi), %rdi jmp 2f @@ -637,6 +637,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode) */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + UNWIND_HINT_EMPTY /* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi) /* SS */ @@ -866,18 +867,84 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) +.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 + + .if \paranoid + call paranoid_entry + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + .else + call error_entry + .endif + UNWIND_HINT_REGS + + .if \read_cr2 + /* + * Store CR2 early so subsequent faults cannot clobber it. Use R12 as + * intermediate storage as RDX can be clobbered in enter_from_user_mode(). + * GET_CR2_INTO can clobber RAX. + */ + GET_CR2_INTO(%r12); + .endif + + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + + .if \paranoid == 0 + testb $3, CS(%rsp) + jz .Lfrom_kernel_no_context_tracking_\@ + CALL_enter_from_user_mode +.Lfrom_kernel_no_context_tracking_\@: + .endif + + movq %rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp), %rsi /* get error code */ + movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi, %esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + .if \read_cr2 + movq %r12, %rdx /* Move CR2 into 3rd argument */ + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + .if \paranoid + /* this procedure expect "no swapgs" flag in ebx */ + jmp paranoid_exit + .else + jmp error_exit + .endif + +.endm + /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point - * @do_sym: C function to be called - * @has_error_code: True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from + * @do_sym: C function to be called + * @has_error_code: True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from * kernel mode with user GSBASE and/or user CR3. * 2 is special -- see below. * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a + * decrement the IST stack so that nested entries get a * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) + * of recursing.) + * @create_gap: create a 6-word stack gap when coming from kernel mode. + * @read_cr2: load CR2 into the 3rd argument; done before calling any C code * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -902,15 +969,19 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 + .if \shift_ist != -1 && \paranoid != 1 .error "using shift_ist requires paranoid=1" .endif + .if \create_gap && \paranoid + .error "using create_gap requires paranoid=0" + .endif + ASM_CLAC .if \has_error_code == 0 @@ -936,47 +1007,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - .if \paranoid - call paranoid_entry - .else - call error_entry - .endif - UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - call \do_sym - - .if \shift_ist != -1 - addq $\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif + idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -985,21 +1016,9 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ .Lfrom_usermode_switch_stack_\@: - call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movq ORIG_RAX(%rsp), %rsi /* get error code */ - movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl %esi, %esi /* no error code */ + idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0 .endif - call \do_sym - - jmp error_exit - .endif _ASM_NOKPROBE(\sym) END(\sym) .endm @@ -1009,7 +1028,7 @@ idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 idtentry invalid_op do_invalid_op has_error_code=0 idtentry device_not_available do_device_not_available has_error_code=0 -idtentry double_fault do_double_fault has_error_code=1 paranoid=2 +idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1 idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 idtentry invalid_TSS do_invalid_TSS has_error_code=1 idtentry segment_not_present do_segment_not_present has_error_code=1 @@ -1176,10 +1195,10 @@ idtentry xendebug do_debug has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 #ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 +idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 #endif #ifdef CONFIG_X86_MCE @@ -1286,20 +1305,11 @@ ENTRY(error_entry) movq %rax, %rsp /* switch stack */ ENCODE_FRAME_POINTER pushq %r12 - - /* - * We need to tell lockdep that IRQs are off. We can't do this until - * we fix gsbase, and we should do it before enter_from_user_mode - * (which can take locks). - */ - TRACE_IRQS_OFF - CALL_enter_from_user_mode ret .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY .Lerror_entry_done: - TRACE_IRQS_OFF ret /* @@ -1706,11 +1716,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. + */ ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) +#endif ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC @@ -1719,7 +1735,7 @@ ENTRY(rewind_stack_do_exit) movq PER_CPU_VAR(cpu_current_top_of_stack), %rax leaq -PTREGS_SIZE(%rax), %rsp - UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE + UNWIND_HINT_REGS call do_exit END(rewind_stack_do_exit) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index aa3336a7cb15..7d17b3addbbb 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,13 +10,11 @@ #ifdef CONFIG_IA32_EMULATION /* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); - -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); - +#define __sys_ni_syscall __ia32_sys_ni_syscall #else /* CONFIG_IA32_EMULATION */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#define __sys_ni_syscall sys_ni_syscall #endif /* CONFIG_IA32_EMULATION */ #include <asm/syscalls_32.h> @@ -29,6 +27,6 @@ __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &__sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index d5252bc1e380..adf619a856e8 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -4,22 +4,53 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/syscalls.h> #include <asm/asm-offsets.h> #include <asm/syscall.h> -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +extern asmlinkage long sys_ni_syscall(void); + +SYSCALL_DEFINE0(ni_syscall) +{ + return sys_ni_syscall(); +} + #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); +#define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include <asm/syscalls_64.h> #undef __SYSCALL_64 +#undef __SYSCALL_X32 #define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_X32(nr, sym, qual) asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, +#include <asm/syscalls_64.h> +}; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#ifdef CONFIG_X86_X32_ABI + +#define __SYSCALL_64(nr, sym, qual) +#define __SYSCALL_X32(nr, sym, qual) [nr] = sym, + +asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_x32_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; + +#undef __SYSCALL_64 +#undef __SYSCALL_X32 + +#endif diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ad968b7bac72..8e5a3cc5eba2 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -124,13 +124,13 @@ 110 i386 iopl sys_iopl __ia32_sys_iopl 111 i386 vhangup sys_vhangup __ia32_sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys_ni_syscall +113 i386 vm86old sys_vm86old __ia32_sys_ni_syscall 114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 115 i386 swapoff sys_swapoff __ia32_sys_swapoff 116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo 117 i386 ipc sys_ipc __ia32_compat_sys_ipc 118 i386 fsync sys_fsync __ia32_sys_fsync -119 i386 sigreturn sys_sigreturn sys32_sigreturn +119 i386 sigreturn sys_sigreturn __ia32_compat_sys_sigreturn 120 i386 clone sys_clone __ia32_compat_sys_x86_clone 121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname 122 i386 uname sys_newuname __ia32_sys_newuname @@ -177,20 +177,20 @@ 163 i386 mremap sys_mremap __ia32_sys_mremap 164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 -166 i386 vm86 sys_vm86 sys_ni_syscall +166 i386 vm86 sys_vm86 __ia32_sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll __ia32_sys_poll 169 i386 nfsservctl 170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 172 i386 prctl sys_prctl __ia32_sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn __ia32_compat_sys_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction -175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask +175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending 177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo -179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend +179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_compat_sys_rt_sigsuspend 180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread 181 i386 pwrite64 sys_pwrite64 __ia32_compat_sys_x86_pwrite 182 i386 chown sys_chown16 __ia32_sys_chown16 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 94fcd1951aca..1af2be39e7d9 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -1,13 +1,13 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0 in="$1" out="$2" syscall_macro() { - abi="$1" - nr="$2" - entry="$3" + local abi="$1" + local nr="$2" + local entry="$3" # Entry can be either just a function name or "function/qualifier" real_entry="${entry%%/*}" @@ -21,14 +21,14 @@ syscall_macro() { } emit() { - abi="$1" - nr="$2" - entry="$3" - compat="$4" - umlentry="" + local abi="$1" + local nr="$2" + local entry="$3" + local compat="$4" + local umlentry="" - if [ "$abi" = "64" -a -n "$compat" ]; then - echo "a compat entry for a 64-bit syscall makes no sense" >&2 + if [ "$abi" != "I386" -a -n "$compat" ]; then + echo "a compat entry ($abi: $compat) for a 64-bit syscall makes no sense" >&2 exit 1 fi @@ -62,14 +62,17 @@ grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then - # COMMON is the same as 64, except that we don't expect X32 - # programs to use it. Our expectation has nothing to do with - # any generated code, so treat them the same. emit 64 "$nr" "$entry" "$compat" + if [ "$abi" = "COMMON" ]; then + # COMMON means that this syscall exists in the same form for + # 64-bit and X32. + echo "#ifdef CONFIG_X86_X32_ABI" + emit X32 "$nr" "$entry" "$compat" + echo "#endif" + fi elif [ "$abi" = "X32" ]; then - # X32 is equivalent to 64 on an X32-compatible kernel. echo "#ifdef CONFIG_X86_X32_ABI" - emit 64 "$nr" "$entry" "$compat" + emit X32 "$nr" "$entry" "$compat" echo "#endif" elif [ "$abi" = "I386" ]; then emit "$abi" "$nr" "$entry" "$compat" diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 42d4c89f990e..ddff0ca6f509 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/kernel.h> #include <linux/mm_types.h> +#include <linux/elf.h> #include <asm/processor.h> #include <asm/vdso.h> |