From 59df2268311795ee7161776248d5288ee0841d41 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 20 Oct 2016 01:11:08 +0600 Subject: [PATCH 01/40] x86/entry/64: Remove unused 'addskip' parameter of the ALLOC_PT_GPREGS_ON_STACK macro Signed-off-by: Alexander Kuleshov Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161019191108.2230-1-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9a9e5884066c..b493c6a42a44 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -90,8 +90,8 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 21*8 - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - addq $-(15*8+\addskip), %rsp + .macro ALLOC_PT_GPREGS_ON_STACK + addq $-(15*8), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 From 1b00255f32b94b61897c83a22c05ed7d33229957 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:03:59 -0500 Subject: [PATCH 02/40] x86/entry/32, x86/boot/32: Use local labels Add the local label prefix to all non-function named labels in head_32.S and entry_32.S. In addition to decluttering the symbol table, it also will help stack traces to be more sensible. For example, the last reported function in the idle task stack trace will be startup_32_smp() instead of is486(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14f9f7afd478b23a762f40734da1a57c0c273f6e.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 43 ++++++++++++++++++++------------------- arch/x86/kernel/head_32.S | 32 ++++++++++++++--------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21b352a11b49..b03c1cb30145 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,13 +307,13 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: +.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? jz restore_all call preempt_schedule_irq - jmp need_resched + jmp .Lneed_resched END(resume_kernel) #endif @@ -334,7 +334,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) */ ENTRY(xen_sysenter_target) addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + jmp .Lsysenter_past_esp #endif /* @@ -371,7 +371,7 @@ ENTRY(xen_sysenter_target) */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp -sysenter_past_esp: +.Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ @@ -504,9 +504,9 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET -restore_all_notrace: +.Lrestore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* @@ -518,22 +518,23 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + je .Lldt_ss # returning to user-space with LDT SS #endif -restore_nocheck: +.Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: +.Lirq_return: INTERRUPT_RETURN + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error jmp error_code .previous - _ASM_EXTABLE(irq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 -ldt_ss: +.Lldt_ss: /* * Setup and switch to ESPFIX stack * @@ -562,7 +563,7 @@ ldt_ss: */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + jmp .Lrestore_nocheck #endif ENDPROC(entry_INT80_32) @@ -882,7 +883,7 @@ ftrace_call: popl %edx popl %ecx popl %eax -ftrace_ret: +.Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: @@ -952,7 +953,7 @@ GLOBAL(ftrace_regs_call) popl %gs addl $8, %esp /* Skip orig_ax and ip */ popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + jmp .Lftrace_ret popf jmp ftrace_stub @@ -963,7 +964,7 @@ ENTRY(mcount) jb ftrace_stub /* Paging not enabled yet? */ cmpl $ftrace_stub, ftrace_trace_function - jnz trace + jnz .Ltrace #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -976,7 +977,7 @@ ftrace_stub: ret /* taken from glibc */ -trace: +.Ltrace: pushl %eax pushl %ecx pushl %edx @@ -1116,7 +1117,7 @@ ENTRY(nmi) movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax - je nmi_espfix_stack + je .Lnmi_espfix_stack #endif pushl %eax # pt_regs->orig_ax @@ -1132,7 +1133,7 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp restore_all_notrace + jmp .Lrestore_all_notrace .Lnmi_from_sysenter_stack: /* @@ -1143,10 +1144,10 @@ ENTRY(nmi) movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi movl %ebp, %esp - jmp restore_all_notrace + jmp .Lrestore_all_notrace #ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: +.Lnmi_espfix_stack: /* * create the pointer to lss back */ @@ -1164,7 +1165,7 @@ nmi_espfix_stack: call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + jmp .Lirq_return #endif END(nmi) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b6b2f0264af3..65e62256df43 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -248,19 +248,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20); #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) - jb default_entry + jb .Ldefault_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax - jae bad_subarch + jae .Lbad_subarch movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax -bad_subarch: +.Lbad_subarch: WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really @@ -270,14 +270,14 @@ WEAK(xen_entry) __INITDATA subarch_entries: - .long default_entry /* normal x86/PC */ + .long .Ldefault_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ + .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #else - jmp default_entry + jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_HOTPLUG_CPU @@ -317,7 +317,7 @@ ENTRY(startup_32_smp) call load_ucode_ap #endif -default_entry: +.Ldefault_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ X86_CR0_PG) @@ -347,7 +347,7 @@ default_entry: pushfl popl %eax # get EFLAGS testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? - jz enable_paging # hw disallowed setting of ID bit + jz .Lenable_paging # hw disallowed setting of ID bit # which means no CPUID and no CR4 xorl %eax,%eax @@ -357,13 +357,13 @@ default_entry: movl $1,%eax cpuid andl $~1,%edx # Ignore CPUID.FPU - jz enable_paging # No flags or only CPUID.FPU = no CR4 + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz enable_paging + jz .Lenable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -371,7 +371,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja enable_paging + ja .Lenable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -380,7 +380,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc enable_paging + jnc .Lenable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -390,7 +390,7 @@ default_entry: /* Make changes effective */ wrmsr -enable_paging: +.Lenable_paging: /* * Enable paging @@ -419,7 +419,7 @@ enable_paging: */ movb $4,X86 # at least 486 cmpl $-1,X86_CPUID - je is486 + je .Lis486 /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -430,7 +430,7 @@ enable_paging: movl %ecx,X86_VENDOR_ID+8 # last 4 chars orl %eax,%eax # do we have processor info as well? - je is486 + je .Lis486 movl $1,%eax # Use the CPUID instruction to get CPU type cpuid @@ -444,7 +444,7 @@ enable_paging: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: +.Lis486: movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET From 7252c4c35e94863bc13f37b4faad3087250a1588 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:00 -0500 Subject: [PATCH 03/40] x86/entry/32: Rename 'error_code' to 'common_exception' The 'error_code' label is awkwardly named, especially when it shows up in a stack trace. Move it to its own local function and rename it to 'common_exception', analagous to the existing 'common_interrupt'. This also makes related stack traces more sensible. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cca1734a93e52799556d946281b32468f9b93950.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 43 +++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b03c1cb30145..9bd0087ed16f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -529,7 +529,7 @@ restore_all: ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error - jmp error_code + jmp common_exception .previous _ASM_EXTABLE(.Lirq_return, iret_exc) @@ -660,7 +660,7 @@ ENTRY(coprocessor_error) ASM_CLAC pushl $0 pushl $do_coprocessor_error - jmp error_code + jmp common_exception END(coprocessor_error) ENTRY(simd_coprocessor_error) @@ -674,14 +674,14 @@ ENTRY(simd_coprocessor_error) #else pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp common_exception END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC pushl $-1 # mark this as an int pushl $do_device_not_available - jmp error_code + jmp common_exception END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -695,59 +695,59 @@ ENTRY(overflow) ASM_CLAC pushl $0 pushl $do_overflow - jmp error_code + jmp common_exception END(overflow) ENTRY(bounds) ASM_CLAC pushl $0 pushl $do_bounds - jmp error_code + jmp common_exception END(bounds) ENTRY(invalid_op) ASM_CLAC pushl $0 pushl $do_invalid_op - jmp error_code + jmp common_exception END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC pushl $0 pushl $do_coprocessor_segment_overrun - jmp error_code + jmp common_exception END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS - jmp error_code + jmp common_exception END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC pushl $do_segment_not_present - jmp error_code + jmp common_exception END(segment_not_present) ENTRY(stack_segment) ASM_CLAC pushl $do_stack_segment - jmp error_code + jmp common_exception END(stack_segment) ENTRY(alignment_check) ASM_CLAC pushl $do_alignment_check - jmp error_code + jmp common_exception END(alignment_check) ENTRY(divide_error) ASM_CLAC pushl $0 # no error code pushl $do_divide_error - jmp error_code + jmp common_exception END(divide_error) #ifdef CONFIG_X86_MCE @@ -755,7 +755,7 @@ ENTRY(machine_check) ASM_CLAC pushl $0 pushl machine_check_vector - jmp error_code + jmp common_exception END(machine_check) #endif @@ -763,7 +763,7 @@ ENTRY(spurious_interrupt_bug) ASM_CLAC pushl $0 pushl $do_spurious_interrupt_bug - jmp error_code + jmp common_exception END(spurious_interrupt_bug) #ifdef CONFIG_XEN @@ -1028,7 +1028,7 @@ return_to_handler: ENTRY(trace_page_fault) ASM_CLAC pushl $trace_do_page_fault - jmp error_code + jmp common_exception END(trace_page_fault) #endif @@ -1036,7 +1036,10 @@ ENTRY(page_fault) ASM_CLAC pushl $do_page_fault ALIGN -error_code: + jmp common_exception +END(page_fault) + +common_exception: /* the function address is in %gs's slot on the stack */ pushl %fs pushl %es @@ -1065,7 +1068,7 @@ error_code: movl %esp, %eax # pt_regs pointer call *%edi jmp ret_from_exception -END(page_fault) +END(common_exception) ENTRY(debug) /* @@ -1182,14 +1185,14 @@ END(int3) ENTRY(general_protection) pushl $do_general_protection - jmp error_code + jmp common_exception END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp error_code + jmp common_exception END(async_page_fault) #endif From 4d516f41704055710da872b62ddc4b6d23248984 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:01 -0500 Subject: [PATCH 04/40] x86/entry/32: Fix the end of the stack for newly forked tasks Thanks to all the recent x86 entry code refactoring, most tasks' kernel stacks start at the same offset right below their saved pt_regs, regardless of which syscall was used to enter the kernel. That creates a nice convention which makes it straightforward to identify the end of the stack, which can be useful for the unwinder to verify the stack is sane. Calling schedule_tail() directly breaks that convention because its an asmlinkage function so its argument has to be pushed on the stack. Add a wrapper which creates a proper "end of stack" frame header before the call. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ecafcd882676bf48ceaf50483782552bb98476e5.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 9bd0087ed16f..22251055c5b7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,6 +45,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -237,6 +238,23 @@ ENTRY(__switch_to_asm) jmp __switch_to END(__switch_to_asm) +/* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) /* * A newly forked process directly context switches into this address. * @@ -245,9 +263,7 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ From 6616a147a79c6fc280572f5a993e9e5ebd200d24 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:02 -0500 Subject: [PATCH 05/40] x86/boot/32: Fix the end of the stack for idle tasks The frame at the end of each idle task stack is inconsistent with real task stacks, which have a stack frame header and a real return address before the pt_regs area. This inconsistency can be confusing for stack unwinders. It also hides useful information about what asm code was involved in calling into C. Fix that by changing the initial code jumps to calls. Also add infinite loops after the calls to make it clear that the calls don't return, and to hang if they do. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2588f34b6fbac4ae6f6f9ead2a78d7f8d58a6341.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 65e62256df43..9a6f8e820ae1 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -289,7 +289,8 @@ num_subarch_entries = (. - subarch_entries) / 4 ENTRY(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp - jmp *(initial_code) + call *(initial_code) +1: jmp 1b ENDPROC(start_cpu0) #endif @@ -470,8 +471,9 @@ ENTRY(startup_32_smp) xorl %eax,%eax # Clear LDT lldt %ax - pushl $0 # fake return address for unwinder - jmp *(initial_code) + call *(initial_code) +1: jmp 1b +ENDPROC(startup_32_smp) #include "verify_cpu.S" From b9b1a9c363ff7b17b2a35e20e28e86a449cfde1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:03 -0500 Subject: [PATCH 06/40] x86/boot/smp/32: Fix initial idle stack location on 32-bit kernels On 32-bit kernels, the initial idle stack calculation doesn't take into account the TOP_OF_KERNEL_STACK_PADDING, making the stack end address inconsistent with other tasks on 32-bit. Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6cf569410bfa84cf923902fc4d628444cace94be.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 951f093a96fe..dcbd45ad8db6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -964,9 +964,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(idle))) - 1); - + idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; From a9468df5ad48d06e5237fc2b56fb04c52f79c8c4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:04 -0500 Subject: [PATCH 07/40] x86/boot/64: Use a common function for starting CPUs There are two different pieces of code for starting a CPU: start_cpu0() and the end of secondary_startup_64(). They're identical except for the stack setup. Combine the common parts into a shared start_cpu() function. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1d692ffa62fcb3cc835a5b254e953f2d9bab3549.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..363177790110 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,13 +265,17 @@ ENTRY(secondary_startup_64) movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx - wrmsr + wrmsr /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - - /* Finally jump to run C code and to be on real kernel address + jmp start_cpu +ENDPROC(secondary_startup_64) + +ENTRY(start_cpu) + /* + * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this @@ -300,7 +304,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq -ENDPROC(secondary_startup_64) +ENDPROC(start_cpu) #include "verify_cpu.S" @@ -308,15 +312,11 @@ ENDPROC(secondary_startup_64) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary(). + * start_secondary() via start_cpu(). */ ENTRY(start_cpu0) - movq initial_stack(%rip),%rsp - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + movq initial_stack(%rip), %rsp + jmp start_cpu ENDPROC(start_cpu0) #endif From 595c1e645d9fd8561104b5680931f68a429aaa1c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:05 -0500 Subject: [PATCH 08/40] x86/boot/64: Put a real return address on the idle task stack The frame at the end of each idle task stack has a zeroed return address. This is inconsistent with real task stacks, which have a real return address at that spot. This inconsistency can be confusing for stack unwinders. It also hides useful information about what asm code was involved in calling into C. Make it a real address by using the side effect of a call instruction to push the instruction pointer on the stack. Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f59593ae7b15d5126f872b0a23143173d28aa32d.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 363177790110..1c5e5dba5704 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -299,8 +299,9 @@ ENTRY(start_cpu) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder + call 1f # put return address on stack for unwinder +1: xorq %rbp, %rbp # clear frame pointer + movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq From 22dc391865af29a1332bd1d17152f2ca7188bc4a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:06 -0500 Subject: [PATCH 09/40] x86/boot: Fix the end of the stack for idle tasks Thanks to all the recent x86 entry code refactoring, most tasks' kernel stacks start at the same offset right below their saved pt_regs, regardless of which syscall was used to enter the kernel. That creates a nice convention which makes it straightforward to identify the end of the stack, which can be useful for the unwinder to verify the stack is sane. However, the boot CPU's idle "swapper" task doesn't follow that convention. Fix that by starting its stack at a sizeof(pt_regs) offset from the end of the stack page. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/81aee3beb6ed88e44f1bea6986bb7b65c368f77a.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 9 ++++++++- arch/x86/kernel/head_64.S | 15 +++++++-------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 9a6f8e820ae1..df541ac2071e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -63,6 +63,8 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +#define SIZEOF_PTREGS 17*4 + /* * Number of possible pages in the lowmem region. * @@ -708,7 +710,12 @@ ENTRY(initial_page_table) .data .balign 4 ENTRY(initial_stack) - .long init_thread_union+THREAD_SIZE + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ + TOP_OF_KERNEL_STACK_PADDING; __INITRODATA int_msg: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1c5e5dba5704..b07cd27e33a5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,13 +66,8 @@ startup_64: * tables and then reload them. */ - /* - * Setup stack for verify_cpu(). "-8" because initial_stack is defined - * this way, see below. Our best guess is a NULL ptr for stack - * termination heuristics and we don't want to break anything which - * might depend on it (kgdb, ...). - */ - leaq (__end_init_task - 8)(%rip), %rsp + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp /* Sanitize CPU configuration */ call verify_cpu @@ -329,7 +324,11 @@ ENDPROC(start_cpu0) GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) - .quad init_thread_union+THREAD_SIZE-8 + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA bad_address: From e728f61ce05404a26447e6bbc1885ca5956d6a44 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 21 Sep 2016 16:04:07 -0500 Subject: [PATCH 10/40] x86/boot: Move the _stext marker to before the boot code When core_kernel_text() is used to determine whether an address on a task's stack trace is a kernel text address, it incorrectly returns false for early text addresses for the head code between the _text and _stext markers. Among other things, this can cause the unwinder to behave incorrectly when unwinding to x86 head code. Head code is text code too, so mark it as such. This seems to match the intent of other users of the _stext symbol, and it also seems consistent with what other architectures are already doing. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/789cf978866420e72fa89df44aa2849426ac378d.1474480779.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index dbf67f64d5ec..e79f15f108a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -91,10 +91,10 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; /* bootstrapping code */ HEAD_TEXT . = ALIGN(8); - _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT From 29a6d7964d6853f5bcd84dfb92c074fb41d00563 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 20 Oct 2016 18:07:04 +0600 Subject: [PATCH 11/40] entry/64: Remove unused ZERO_EXTRA_REGS macro Signed-off-by: Alexander Kuleshov Cc: Paolo Bonzini Cc: Borislav Petkov Cc: Andy Lutomirski Link: http://lkml.kernel.org/r/20161020120704.24042-1-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index b493c6a42a44..38dcdfa2be55 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -147,15 +147,6 @@ For 32-bit we have the following conventions - kernel is built with movq 5*8+\offset(%rsp), %rbx .endm - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 From 946c191161cef10c667b5ee3179db1714fa5b7c0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:40 -0500 Subject: [PATCH 12/40] x86/entry/unwind: Create stack frames for saved interrupt registers With frame pointers, when a task is interrupted, its stack is no longer completely reliable because the function could have been interrupted before it had a chance to save the previous frame pointer on the stack. So the caller of the interrupted function could get skipped by a stack trace. This is problematic for live patching, which needs to know whether a stack trace of a sleeping task can be relied upon. There's currently no way to detect if a sleeping task was interrupted by a page fault exception or preemption before it went to sleep. Another issue is that when dumping the stack of an interrupted task, the unwinder has no way of knowing where the saved pt_regs registers are, so it can't print them. This solves those issues by encoding the pt_regs pointer in the frame pointer on entry from an interrupt or an exception. This patch also updates the unwinder to be able to decode it, because otherwise the unwinder would be broken by this change. Note that this causes a change in the behavior of the unwinder: each instance of a pt_regs on the stack is now considered a "frame". So callers of unwind_get_return_address() will now get an occasional 'regs->ip' address that would have previously been skipped over. Suggested-by: Andy Lutomirski Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8b9f84a21e39d249049e0547b559ff8da0df0988.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 20 +++++++++ arch/x86/entry/entry_32.S | 33 +++++++++++++-- arch/x86/entry/entry_64.S | 10 +++-- arch/x86/include/asm/unwind.h | 16 ++++++- arch/x86/kernel/unwind_frame.c | 76 ++++++++++++++++++++++++++++++---- 5 files changed, 139 insertions(+), 16 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 38dcdfa2be55..05ed3d393da7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -192,6 +192,26 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +#ifdef CONFIG_FRAME_POINTER + .if \ptregs_offset + leaq \ptregs_offset(%rsp), %rbp + .else + mov %rsp, %rbp + .endif + orq $0x1, %rbp +#endif +.endm + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 22251055c5b7..acc0c6f36f3f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -176,6 +176,22 @@ SET_KERNEL_GS %edx .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original rbp. + */ +.macro ENCODE_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER + mov %esp, %ebp + orl $0x1, %ebp +#endif +.endm + .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -641,6 +657,7 @@ common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax call do_IRQ @@ -652,6 +669,7 @@ ENTRY(name) \ ASM_CLAC; \ pushl $~(nr); \ SAVE_ALL; \ + ENCODE_FRAME_POINTER; \ TRACE_IRQS_OFF \ movl %esp, %eax; \ call fn; \ @@ -786,6 +804,7 @@ END(spurious_interrupt_bug) ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF /* @@ -840,6 +859,7 @@ ENTRY(xen_failsafe_callback) jmp iret_exc 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER jmp ret_from_exception .section .fixup, "ax" @@ -1067,6 +1087,7 @@ common_exception: pushl %edx pushl %ecx pushl %ebx + ENCODE_FRAME_POINTER cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1099,6 +1120,7 @@ ENTRY(debug) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer @@ -1114,11 +1136,11 @@ ENTRY(debug) .Ldebug_from_sysenter_stack: /* We're on the SYSENTER stack. Switch off. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp TRACE_IRQS_OFF call do_debug - movl %ebp, %esp + movl %ebx, %esp jmp ret_from_exception END(debug) @@ -1141,6 +1163,7 @@ ENTRY(nmi) pushl %eax # pt_regs->orig_ax SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1159,10 +1182,10 @@ ENTRY(nmi) * We're on the SYSENTER stack. Switch off. No one (not even debug) * is using the thread stack right now, so it's safe for us to use it. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi - movl %ebp, %esp + movl %ebx, %esp jmp .Lrestore_all_notrace #ifdef CONFIG_X86_ESPFIX32 @@ -1179,6 +1202,7 @@ ENTRY(nmi) .endr pushl %eax SAVE_ALL + ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi @@ -1192,6 +1216,7 @@ ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a358b37..65fad8a63cd0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -469,6 +469,7 @@ END(irq_entries_start) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER testb $3, CS(%rsp) jz 1f @@ -985,6 +986,7 @@ ENTRY(xen_failsafe_callback) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) @@ -1028,6 +1030,7 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx rdmsr @@ -1075,6 +1078,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1257,6 +1261,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + ENCODE_FRAME_POINTER /* * At this point we no longer need to worry about stack damage @@ -1270,11 +1275,10 @@ ENTRY(nmi) /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. */ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret .Lnmi_from_kernel: /* diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 46de9ac4b990..c5a7f3a930dd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -13,6 +13,7 @@ struct unwind_state { int graph_idx; #ifdef CONFIG_FRAME_POINTER unsigned long *bp; + struct pt_regs *regs; #else unsigned long *sp; #endif @@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) if (unwind_done(state)) return NULL; - return state->bp + 1; + return state->regs ? &state->regs->ip : state->bp + 1; +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs; } #else /* !CONFIG_FRAME_POINTER */ @@ -58,6 +67,11 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return NULL; +} + #endif /* CONFIG_FRAME_POINTER */ #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a2456d4d286a..2221ab1678c5 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -14,6 +14,9 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (unwind_done(state)) return 0; + if (state->regs && user_mode(state->regs)) + return 0; + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, addr_p); @@ -21,6 +24,20 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. + */ +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} + static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { @@ -43,26 +60,59 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { - unsigned long *next_bp; + struct pt_regs *regs; + unsigned long *next_bp, *next_frame; + size_t next_len; if (unwind_done(state)) return false; - next_bp = (unsigned long *)*state->bp; + /* have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + /* get the next frame pointer */ + if (state->regs) + next_bp = (unsigned long *)state->regs->bp; + else + next_bp = (unsigned long *)*state->bp; + + /* is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + next_frame = (unsigned long *)regs; + next_len = sizeof(*regs); + } else { + next_frame = next_bp; + next_len = FRAME_HEADER_SIZE; + } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + if (!update_stack_state(state, next_frame, next_len)) return false; - /* move to the next frame */ - state->bp = next_bp; + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + return true; + +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; } EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + unsigned long *bp, *frame; + size_t len; + memset(state, 0, sizeof(*state)); state->task = task; @@ -73,12 +123,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } /* set up the starting stack frame */ - state->bp = get_frame_pointer(task, regs); + bp = get_frame_pointer(task, regs); + regs = decode_frame_pointer(bp); + if (regs) { + state->regs = regs; + frame = (unsigned long *)regs; + len = sizeof(*regs); + } else { + state->bp = bp; + frame = bp; + len = FRAME_HEADER_SIZE; + } /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(state->bp, state->task, &state->stack_info, + get_stack_info(frame, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + update_stack_state(state, frame, len); /* * The caller can provide the address of the first frame directly From acb4608ad1865a42af8e0a2db332a7c3a381e1f5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:41 -0500 Subject: [PATCH 13/40] x86/unwind: Create stack frames for saved syscall registers The entry code doesn't encode the pt_regs pointer for syscalls. But the pt_regs are always at the same location, so we can add a manual check for them. A later patch prints them as part of the oops stack dump. They could be useful, for example, to determine the arguments to a system call. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e176aa9272930cd3f51fda0b94e2eae356677da4.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 35 ++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 2221ab1678c5..579542736b7e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -24,6 +24,14 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static bool is_last_task_frame(struct unwind_state *state) +{ + unsigned long bp = (unsigned long)state->bp; + unsigned long regs = (unsigned long)task_pt_regs(state->task); + + return bp == regs - FRAME_HEADER_SIZE; +} + /* * This determines if the frame pointer actually contains an encoded pointer to * pt_regs on the stack. See ENCODE_FRAME_POINTER. @@ -71,6 +79,33 @@ bool unwind_next_frame(struct unwind_state *state) if (state->regs && user_mode(state->regs)) goto the_end; + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread_tls(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * do_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + return true; + } + /* get the next frame pointer */ if (state->regs) next_bp = (unsigned long *)state->regs->bp; From 79439d8e15b51fa359a0f5d0c8f856c1f5b4bd56 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:42 -0500 Subject: [PATCH 14/40] x86/dumpstack: Print stack identifier on its own line show_trace_log_lvl() prints the stack id (e.g. "") without a newline so that any stack address printed after it will appear on the same line. That causes the first stack address to be vertically misaligned with the rest, making it visually cluttered and slightly confusing: Call Trace: [] dump_stack+0x86/0xc3 [] perf_callchain_kernel+0x14b/0x160 [] get_perf_callchain+0x15f/0x2b0 ... [] ? _raw_spin_unlock_irq+0x33/0x60 [] finish_task_switch+0xb4/0x250 [] do_async_page_fault+0x2c/0xa0 It will look worse once we start printing pt_regs registers found in the middle of the stack: RIP: 0010:[] [] _raw_spin_unlock_irq+0x33/0x60 RSP: 0018:ffff88007876f720 EFLAGS: 00000206 RAX: ffff8800786caa40 RBX: ffff88007d5da140 RCX: 0000000000000007 ... Improve readability by adding a newline to the stack name: Call Trace: [] dump_stack+0x86/0xc3 [] perf_callchain_kernel+0x14b/0x160 [] get_perf_callchain+0x15f/0x2b0 ... [] ? _raw_spin_unlock_irq+0x33/0x60 [] finish_task_switch+0xb4/0x250 [] do_async_page_fault+0x2c/0xa0 Now that "continued" lines are no longer needed, we can also remove the hack of using the empty string (aka KERN_CONT) and replace it with KERN_DEFAULT. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9bdd6dee2c74555d45500939fcc155997dc7889e.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9b7cf5c28f5f..32511772b424 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -97,7 +97,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, stack_type_str(stack_info.type, &str_begin, &str_end); if (str_begin) - printk("%s <%s> ", log_lvl, str_begin); + printk("%s <%s>\n", log_lvl, str_begin); /* * Scan the stack, printing any text addresses we find. At the @@ -149,7 +149,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, } if (str_end) - printk("%s <%s> ", log_lvl, str_end); + printk("%s <%s>\n", log_lvl, str_end); } } @@ -164,12 +164,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, ""); + show_stack_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, ""); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; From 3b3fa11bc7000bb86c9fd30703da3689a9a9758d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:43 -0500 Subject: [PATCH 15/40] x86/dumpstack: Print any pt_regs found on the stack Now that we can find pt_regs registers on the stack, print them. Here's an example of what it looks like: Call Trace: [] dump_stack+0x86/0xc3 [] hrtimer_interrupt+0xb3/0x1c0 [] local_apic_timer_interrupt+0x36/0x60 [] smp_apic_timer_interrupt+0x3d/0x50 [] apic_timer_interrupt+0x9e/0xb0 RIP: 0010:[] [] _raw_spin_unlock_irq+0x33/0x60 RSP: 0018:ffff880079c4f760 EFLAGS: 00000202 RAX: ffff880078738000 RBX: ffff88007d3da0c0 RCX: 0000000000000007 RDX: 0000000000006d78 RSI: ffff8800787388f0 RDI: ffff880078738000 RBP: ffff880079c4f768 R08: 0000002199088f38 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff81e0d540 R13: ffff8800369fb700 R14: 0000000000000000 R15: ffff880078738000 [] finish_task_switch+0xb4/0x250 [] ? finish_task_switch+0x76/0x250 [] __schedule+0x3e1/0xb20 ... [] trace_do_page_fault+0x58/0x2c0 [] do_async_page_fault+0x2c/0xa0 [] async_page_fault+0x28/0x30 RIP: 0010:[] [] __clear_user+0x42/0x70 RSP: 0018:ffff880079c4fd38 EFLAGS: 00010202 RAX: 0000000000000000 RBX: 0000000000000138 RCX: 0000000000000138 RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000061b640 RBP: ffff880079c4fd48 R08: 0000002198feefd7 R09: ffffffff82a40928 R10: 0000000000000001 R11: 0000000000000000 R12: 000000000061b640 R13: 0000000000000000 R14: ffff880079c50000 R15: ffff8800791d7400 [] ? __clear_user+0x23/0x70 [] clear_user+0x2b/0x40 [] load_elf_binary+0x1472/0x1750 [] search_binary_handler+0xa1/0x200 [] do_execveat_common.isra.36+0x6cb/0x9f0 [] ? do_execveat_common.isra.36+0x623/0x9f0 [] SyS_execve+0x3a/0x50 [] do_syscall_64+0x6c/0x1e0 [] entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:[<00007fd2e2f2e537>] [<00007fd2e2f2e537>] 0x7fd2e2f2e537 RSP: 002b:00007ffc449c5fc8 EFLAGS: 00000246 RAX: ffffffffffffffda RBX: 00007ffc449c8860 RCX: 00007fd2e2f2e537 RDX: 000000000127cc40 RSI: 00007ffc449c8860 RDI: 00007ffc449c6029 RBP: 00007ffc449c60b0 R08: 65726f632d667265 R09: 00007ffc449c5e20 R10: 00000000000005a7 R11: 0000000000000246 R12: 000000000127cc40 R13: 000000000127ce05 R14: 00007ffc449c6029 R15: 000000000127ce01 Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5cc2c512ec82cfba00dd22467644d4ed751a48c0.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 32511772b424..64281a1d4e48 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -82,7 +82,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (; stack; stack = stack_info.next_sp) { + for (regs = NULL; stack; stack = stack_info.next_sp) { const char *str_begin, *str_end; /* @@ -119,6 +119,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!__kernel_text_address(addr)) continue; + /* + * Don't print regs->ip again if it was already printed + * by __show_regs() below. + */ + if (regs && stack == ®s->ip) { + unwind_next_frame(&state); + continue; + } + if (stack == ret_addr_p) reliable = 1; @@ -146,6 +155,11 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * of the addresses will just be printed as unreliable. */ unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) + __show_regs(regs, 0); } if (str_end) From 1141c3e39c64e4aba2d98cb3dcca95369c9dafbe Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:44 -0500 Subject: [PATCH 16/40] x86/dumpstack: Fix duplicate RIP address display in __show_regs() The RIP address is shown twice in __show_regs(). Before: RIP: 0010:[] [] native_write_msr+0x6/0x30 After: RIP: 0010:[] native_write_msr+0x6/0x30 Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b3fda66f36761759b000883b059cdd9a7649dcc1.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..b3b50ac6a302 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -61,8 +61,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] %pS\n", regs->cs & 0xffff, + regs->ip, (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", From 6fa81a12b3f1834a22167aa2d7b24dcc4bf884e3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Oct 2016 11:34:45 -0500 Subject: [PATCH 17/40] x86/dumpstack: Print orig_ax in __show_regs() The value of regs->orig_ax contains potentially useful debugging data: For syscalls it contains the syscall number. For interrupts it contains the (negated) vector number. To reduce noise, print it only if it has a useful value (i.e., something other than -1). Here's what it looks like for a write syscall: RIP: 0033:[<00007f53ad7b1940>] 0x7f53ad7b1940 RSP: 002b:00007fff8de66558 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000046 RCX: 00007f53ad7b1940 RDX: 0000000000000002 RSI: 00007f53ae0ca000 RDI: 0000000000000001 ... Suggested-by: Andy Lutomirski Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/93f0fe0307a4af884d3fca00edabcc8cff236002.1476973742.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3b50ac6a302..f1c36da4c9b5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -63,8 +63,13 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] %pS\n", regs->cs & 0xffff, regs->ip, (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, regs->sp, regs->flags); + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", From 5e25d5bdf6d407224ad185a3fb8b870ad7d6c627 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Sun, 23 Oct 2016 19:56:46 +0600 Subject: [PATCH 18/40] x86/entry64: Remove unused audit related macros These macros were added in the following commit: 86a1c34a929f ("x86_64 syscall audit fast-path") They were used in two-phase sycalls entry tracing, but this functionality was then moved to the arch/x86/entry/common.c:syscall_trace_enter() function, in the following commit: 1f484aa69046 ("x86/entry: Move C entry and exit code to arch/x86/entry/common.c") syscall_trace_enter() now uses the defines from , so these defines entry_64.S are no longer used anywhere. Signed-off-by: Alexander Kuleshov Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161023135646.4453-1-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 65fad8a63cd0..5b219707c2f2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,12 +38,6 @@ #include #include -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - .code64 .section .entry.text, "ax" From efdb4167e676aaba7505bec739785b76e206cb45 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:11 -0500 Subject: [PATCH 19/40] scripts/faddr2line: Fix "size mismatch" error I'm not sure how we missed this problem before. When I take a function address and size from an oops and give it to faddr2line, it usually complains about a size mismatch: $ scripts/faddr2line ~/k/vmlinux write_sysrq_trigger+0x51/0x60 skipping write_sysrq_trigger address at 0xffffffff815731a1 due to size mismatch (0x60 != 83) no match for write_sysrq_trigger+0x51/0x60 The problem is caused by differences in how kallsyms and faddr2line determine a function's size. kallsyms calculates a function's size by parsing the output of 'nm -n' and subtracting the next function's address from the current function's address. This means that nop instructions after the end of the function are included in the size. In contrast, faddr2line reads the size from the symbol table, which does *not* include the ending nops in the function's size. Change faddr2line to calculate the size from the output of 'nm -n' to be consistent with kallsyms and oops outputs. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bd313ed7c4003f6b1fda63e825325c44a9d837de.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- scripts/faddr2line | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index 450b33257339..29df825d375c 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -105,9 +105,18 @@ __faddr2line() { # In rare cases there might be duplicates. while read symbol; do local fields=($symbol) - local sym_base=0x${fields[1]} - local sym_size=${fields[2]} - local sym_type=${fields[3]} + local sym_base=0x${fields[0]} + local sym_type=${fields[1]} + local sym_end=0x${fields[3]} + + # calculate the size + local sym_size=$(($sym_end - $sym_base)) + if [[ -z $sym_size ]] || [[ $sym_size -le 0 ]]; then + warn "bad symbol size: base: $sym_base end: $sym_end" + DONE=1 + return + fi + sym_size=0x$(printf %x $sym_size) # calculate the address local addr=$(($sym_base + $offset)) @@ -116,26 +125,26 @@ __faddr2line() { DONE=1 return fi - local hexaddr=0x$(printf %x $addr) + addr=0x$(printf %x $addr) # weed out non-function symbols - if [[ $sym_type != "FUNC" ]]; then + if [[ $sym_type != t ]] && [[ $sym_type != T ]]; then [[ $print_warnings = 1 ]] && - echo "skipping $func address at $hexaddr due to non-function symbol" + echo "skipping $func address at $addr due to non-function symbol of type '$sym_type'" continue fi # if the user provided a size, make sure it matches the symbol's size if [[ -n $size ]] && [[ $size -ne $sym_size ]]; then [[ $print_warnings = 1 ]] && - echo "skipping $func address at $hexaddr due to size mismatch ($size != $sym_size)" + echo "skipping $func address at $addr due to size mismatch ($size != $sym_size)" continue; fi # make sure the provided offset is within the symbol's range if [[ $offset -gt $sym_size ]]; then [[ $print_warnings = 1 ]] && - echo "skipping $func address at $hexaddr due to size mismatch ($offset > $sym_size)" + echo "skipping $func address at $addr due to size mismatch ($offset > $sym_size)" continue fi @@ -143,12 +152,12 @@ __faddr2line() { [[ $FIRST = 0 ]] && echo FIRST=0 - local hexsize=0x$(printf %x $sym_size) - echo "$func+$offset/$hexsize:" - addr2line -fpie $objfile $hexaddr | sed "s; $dir_prefix\(\./\)*; ;" + # pass real address to addr2line + echo "$func+$offset/$sym_size:" + addr2line -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;" DONE=1 - done < <(readelf -sW $objfile | awk -v f=$func '$8 == f {print}') + done < <(nm -n $objfile | awk -v fn=$func '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, $1 }') } [[ $# -lt 2 ]] && usage From bb5e5ce545f2031c96f7901cd8d1698ea3ca4c9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:12 -0500 Subject: [PATCH 20/40] x86/dumpstack: Remove kernel text addresses from stack dump Printing kernel text addresses in stack dumps is of questionable value, especially now that address randomization is becoming common. It can be a security issue because it leaks kernel addresses. It also affects the usefulness of the stack dump. Linus says: "I actually spend time cleaning up commit messages in logs, because useless data that isn't actually information (random hex numbers) is actively detrimental. It makes commit logs less legible. It also makes it harder to parse dumps. It's not useful. That makes it actively bad. I probably look at more oops reports than most people. I have not found the hex numbers useful for the last five years, because they are just randomized crap. The stack content thing just makes code scroll off the screen etc, for example." The only real downside to removing these addresses is that they can be used to disambiguate duplicate symbol names. However such cases are rare, and the context of the stack dump should be enough to be able to figure it out. There's now a 'faddr2line' script which can be used to convert a function address to a file name and line: $ ./scripts/faddr2line ~/k/vmlinux write_sysrq_trigger+0x51/0x60 write_sysrq_trigger+0x51/0x60: write_sysrq_trigger at drivers/tty/sysrq.c:1098 Or gdb can be used: $ echo "list *write_sysrq_trigger+0x51" |gdb ~/k/vmlinux |grep "is in" (gdb) 0xffffffff815b5d83 is in driver_probe_device (/home/jpoimboe/git/linux/drivers/base/dd.c:378). (But note that when there are duplicate symbol names, gdb will only show the first symbol it finds. faddr2line is recommended over gdb because it handles duplicates and it also does function size checking.) Here's an example of what a stack dump looks like after this change: BUG: unable to handle kernel NULL pointer dereference at (null) IP: sysrq_handle_crash+0x45/0x80 PGD 36bfa067 [ 29.650644] PUD 7aca3067 Oops: 0002 [#1] PREEMPT SMP Modules linked in: ... CPU: 1 PID: 786 Comm: bash Tainted: G E 4.9.0-rc1+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.1-1.fc24 04/01/2014 task: ffff880078582a40 task.stack: ffffc90000ba8000 RIP: 0010:sysrq_handle_crash+0x45/0x80 RSP: 0018:ffffc90000babdc8 EFLAGS: 00010296 RAX: ffff880078582a40 RBX: 0000000000000063 RCX: 0000000000000001 RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000292 RBP: ffffc90000babdc8 R08: 0000000b31866061 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000007 R14: ffffffff81ee8680 R15: 0000000000000000 FS: 00007ffb43869700(0000) GS:ffff88007d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007a3e9000 CR4: 00000000001406e0 Stack: ffffc90000babe00 ffffffff81572d08 ffffffff81572bd5 0000000000000002 0000000000000000 ffff880079606600 00007ffb4386e000 ffffc90000babe20 ffffffff81573201 ffff880036a3fd00 fffffffffffffffb ffffc90000babe40 Call Trace: __handle_sysrq+0x138/0x220 ? __handle_sysrq+0x5/0x220 write_sysrq_trigger+0x51/0x60 proc_reg_write+0x42/0x70 __vfs_write+0x37/0x140 ? preempt_count_sub+0xa1/0x100 ? __sb_start_write+0xf5/0x210 ? vfs_write+0x183/0x1a0 vfs_write+0xb8/0x1a0 SyS_write+0x58/0xc0 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: 0033:0x7ffb42f55940 RSP: 002b:00007ffd33bb6b18 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000046 RCX: 00007ffb42f55940 RDX: 0000000000000002 RSI: 00007ffb4386e000 RDI: 0000000000000001 RBP: 0000000000000011 R08: 00007ffb4321ea40 R09: 00007ffb43869700 R10: 00007ffb43869700 R11: 0000000000000246 R12: 0000000000778a10 R13: 00007ffd33bb5c00 R14: 0000000000000007 R15: 0000000000000010 Code: 34 e8 d0 34 bc ff 48 c7 c2 3b 2b 57 81 be 01 00 00 00 48 c7 c7 e0 dd e5 81 e8 a8 55 ba ff c7 05 0e 3f de 00 01 00 00 00 0f ae f8 04 25 00 00 00 00 01 5d c3 e8 4c 49 bc ff 84 c0 75 c3 48 c7 RIP: sysrq_handle_crash+0x45/0x80 RSP: ffffc90000babdc8 CR2: 0000000000000000 Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/69329cb29b8f324bb5fcea14d61d224807fb6488.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kdebug.h | 1 - arch/x86/kernel/dumpstack.c | 18 ++++-------------- arch/x86/kernel/process_32.c | 7 +++---- arch/x86/kernel/process_64.c | 6 +++--- arch/x86/mm/fault.c | 3 +-- arch/x86/platform/uv/uv_nmi.c | 4 ++-- 6 files changed, 13 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index d31881188431..29a594a3b82a 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -21,7 +21,6 @@ enum die_val { DIE_NMIUNKNOWN, }; -extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 64281a1d4e48..f967652500fa 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -46,14 +46,7 @@ static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { touch_nmi_watchdog(); - printk("%s [<%p>] %s%pB\n", - log_lvl, (void *)address, reliable ? "" : "? ", - (void *)address); -} - -void printk_address(unsigned long address) -{ - pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -275,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err) sp = kernel_stack_pointer(regs); savesegment(ss, ss); } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", + (void *)regs->ip, ss, sp); #else /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip); - printk(" RSP <%016lx>\n", regs->sp); + printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); #endif return 0; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..e3223bc78cb6 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -72,10 +72,9 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - (u16)regs->cs, regs->ip, regs->flags, - smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); + printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, + smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f1c36da4c9b5..c99f1ca35eb5 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -61,10 +61,10 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] %pS\n", regs->cs & 0xffff, - regs->ip, (void *)regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, + (void *)regs->ip); printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + regs->sp, regs->flags); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..17c55a536fdd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -679,8 +679,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); - printk(KERN_ALERT "IP:"); - printk_address(regs->ip); + printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); } diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index cd5173a2733f..8410e7d0a5b5 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -387,8 +387,8 @@ static void uv_nmi_dump_cpu_ip_hdr(void) /* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); - printk_address(regs->ip); + pr_info("UV: %4d %6d %-32.32s %pS", + cpu, current->pid, current->comm, (void *)regs->ip); } /* From 0ee1dd9f5e7eae4e55f95935b72d4beecb03de9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:13 -0500 Subject: [PATCH 21/40] x86/dumpstack: Remove raw stack dump For mostly historical reasons, the x86 oops dump shows the raw stack values: ... [registers] Stack: ffff880079af7350 ffff880079905400 0000000000000000 ffffc900008f3ae0 ffffffffa0196610 0000000000000001 00010000ffffffff 0000000087654321 0000000000000002 0000000000000000 0000000000000000 0000000000000000 Call Trace: ... This seems to be an artifact from long ago, and probably isn't needed anymore. It generally just adds noise to the dump, and it can be actively harmful because it leaks kernel addresses. Linus says: "The stack dump actually goes back to forever, and it used to be useful back in 1992 or so. But it used to be useful mainly because stacks were simpler and we didn't have very good call traces anyway. I definitely remember having used them - I just do not remember having used them in the last ten+ years. Of course, it's still true that if you can trigger an oops, you've likely already lost the security game, but since the stack dump is so useless, let's aim to just remove it and make games like the above harder." This also removes the related 'kstack=' cmdline option and the 'kstack_depth_to_print' sysctl. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e83bd50df52d8fe88e94d2566426ae40d813bf8f.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 3 -- Documentation/sysctl/kernel.txt | 8 ---- Documentation/x86/x86_64/boot-options.txt | 4 -- arch/x86/include/asm/stacktrace.h | 5 --- arch/x86/kernel/dumpstack.c | 21 +-------- arch/x86/kernel/dumpstack_32.c | 33 +------------- arch/x86/kernel/dumpstack_64.c | 53 +---------------------- kernel/sysctl.c | 7 --- 8 files changed, 4 insertions(+), 130 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 37babf91f2cb..049a9172ed22 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1958,9 +1958,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted. kmemcheck=2 (one-shot mode) Default: 2 (one-shot mode) - kstack=N [X86] Print N words from the kernel stack - in oops dumps. - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ffab8b5caa60..065f18478c1c 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -40,7 +40,6 @@ show up in /proc/sys/kernel: - hung_task_warnings - kexec_load_disabled - kptr_restrict -- kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt - modules_disabled @@ -395,13 +394,6 @@ When kptr_restrict is set to (2), kernel pointers printed using ============================================================== -kstack_depth_to_print: (X86 only) - -Controls the number of words to print when dumping the raw -kernel stack. - -============================================================== - l2cr: (PPC only) This flag controls the L2 cache of G3 processor boards. If diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 0965a71f9942..61b611e9eeaf 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -277,10 +277,6 @@ IOMMU (input/output memory management unit) space might stop working. Use this option if you have devices that are accessed from userspace directly on some PCI host bridge. -Debugging - - kstack=N Print N words from the kernel stack in oops dumps. - Miscellaneous nogbpages diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..1e375b05cfa8 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -43,8 +43,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -86,9 +84,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f967652500fa..499aa6f0fde5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -171,12 +170,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -295,22 +294,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..90cf460d50bd 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -121,36 +121,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..310abf4542dc 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -140,56 +140,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +157,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..17a5a8253294 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -989,13 +989,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, - { - .procname = "kstack_depth_to_print", - .data = &kstack_depth_to_print, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "io_delay_type", .data = &io_delay_type, From adb1fe9ae2ee6ef6bc10f3d5a588020e7664dfa7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:14 -0500 Subject: [PATCH 22/40] mm/page_alloc: Remove kernel address exposure in free_reserved_area() Linus suggested we try to remove some of the low-hanging fruit related to kernel address exposure in dmesg. The only leaks I see on my local system are: Freeing SMP alternatives memory: 32K (ffffffff9e309000 - ffffffff9e311000) Freeing initrd memory: 10588K (ffffa0b736b42000 - ffffa0b737599000) Freeing unused kernel memory: 3592K (ffffffff9df87000 - ffffffff9e309000) Freeing unused kernel memory: 1352K (ffffa0b7288ae000 - ffffa0b728a00000) Freeing unused kernel memory: 632K (ffffa0b728d62000 - ffffa0b728e00000) Linus says: "I suspect we should just remove [the addresses in the 'Freeing' messages]. I'm sure they are useful in theory, but I suspect they were more useful back when the whole "free init memory" was originally done. These days, if we have a use-after-free, I suspect the init-mem situation is the easiest situation by far. Compared to all the dynamic allocations which are much more likely to show it anyway. So having debug output for that case is likely not all that productive." With this patch the freeing messages now look like this: Freeing SMP alternatives memory: 32K Freeing initrd memory: 10588K Freeing unused kernel memory: 3592K Freeing unused kernel memory: 1352K Freeing unused kernel memory: 632K Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6836ff90c45b71d38e5d4405aec56fa9e5d1d4b2.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b3bf6767d54..3f639739cab9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6508,8 +6508,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } From bdcc18b548b8f1fab23c097724c6f32daac03185 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 25 Oct 2016 22:56:04 +0200 Subject: [PATCH 23/40] x86/decoder: Use stdout if insn decoder test is successful If the instruction decoder test ran successful it prints a message like this to stderr: Succeed: decoded and checked 1767380 instructions But, as described in "console mode programming user interface guidelines version 101" which doesn't exist, programs should use stderr for errors or warnings. We're told about a successful run here, so the instruction decoder test should use stdout. Let's fix the typo too, while we're at it. Signed-off-by: Paul Bolle Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1477428965-20548-2-git-send-email-pebolle@tiscali.nl Signed-off-by: Ingo Molnar --- arch/x86/tools/test_get_len.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 56f04db0c9c0..ecf31e0358c8 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) fprintf(stderr, "Warning: decoded and checked %d" " instructions with %d warnings\n", insns, warnings); else - fprintf(stderr, "Succeed: decoded and checked %d" + fprintf(stdout, "Success: decoded and checked %d" " instructions\n", insns); return 0; } From bb12d6740f6de393927362f23f833a79d85df384 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 25 Oct 2016 22:56:05 +0200 Subject: [PATCH 24/40] x86/decoder: Use stderr if insn sanity test fails If the instruction sanity test fails, it prints a "Failure" message to stdout. Make this program behave like the rest of the build and print that message to stderr. Signed-off-by: Paul Bolle Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1477428965-20548-3-git-send-email-pebolle@tiscali.nl Signed-off-by: Ingo Molnar --- arch/x86/tools/insn_sanity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index ba70ff232917..1972565ab106 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -269,7 +269,8 @@ int main(int argc, char **argv) insns++; } - fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + fprintf((errors) ? stderr : stdout, + "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, (errors) ? "Failure" : "Success", insns, From c32c47c68a0ae701088c5b2c3798856ed16746ae Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 26 Oct 2016 10:41:48 -0500 Subject: [PATCH 25/40] x86/unwind: Warn on bad frame pointer Detect situations in the unwinder where the frame pointer refers to a bad address, and print an appropriate warning. Use printk_deferred_once() because the unwinder can be called with the console lock by lockdep via save_stack_trace(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/03c888f6f7414d54fa56b393ea25482be6899b5f.1477496147.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 579542736b7e..9be9a8f8c5df 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -123,8 +123,17 @@ bool unwind_next_frame(struct unwind_state *state) } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_frame, next_len)) - return false; + if (!update_stack_state(state, next_frame, next_len)) { + /* + * Don't warn on bad regs->bp. An interrupt in entry code + * might cause a false positive warning. + */ + if (state->regs) + goto the_end; + + goto bad_address; + } + /* move to the next frame */ if (regs) { state->regs = regs; @@ -136,6 +145,11 @@ bool unwind_next_frame(struct unwind_state *state) return true; +bad_address: + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_bp); the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; return false; From 0d2b8579add41e08aa1110da864f1071d58e6048 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 26 Oct 2016 10:41:50 -0500 Subject: [PATCH 26/40] x86/dumpstack: Warn on stack recursion Print a warning if stack recursion is detected. Use printk_deferred_once() because the unwinder can be called with the console lock by lockdep via save_stack_trace(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/def18247aafaab480844484398e793f552b79bda.1477496147.git.jpoimboe@redhat.com [ Unbroke the lines. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 4 +++- arch/x86/kernel/dumpstack_64.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 90cf460d50bd..d7d20999d68c 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -109,8 +109,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 310abf4542dc..ab0f8b90b51b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -128,8 +128,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } From b6959a362177053c1c90db6dc1af25b6bddd8548 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 26 Oct 2016 10:41:51 -0500 Subject: [PATCH 27/40] x86/unwind: Detect bad stack return address If __kernel_text_address() doesn't recognize a return address on the stack, it probably means that it's some generated code which __kernel_text_address() doesn't know about yet. Otherwise there's probably some stack corruption. Either way, warn about it. Use printk_deferred_once() because the unwinder can be called with the console lock by lockdep via save_stack_trace(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d897898f324e275943b590d160b55e482bba65f.1477496147.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 9be9a8f8c5df..5639db619c03 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -20,7 +20,15 @@ unsigned long unwind_get_return_address(struct unwind_state *state) addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, addr_p); - return __kernel_text_address(addr) ? addr : 0; + if (!__kernel_text_address(addr)) { + printk_deferred_once(KERN_WARNING + "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n", + (void *)addr, addr_p, state->task->comm, + state->task->pid); + return 0; + } + + return addr; } EXPORT_SYMBOL_GPL(unwind_get_return_address); From a01aa6c9f40fe03c82032e7f8b3bcf1e6c93ac0e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 27 Oct 2016 17:15:15 +0300 Subject: [PATCH 28/40] x86/prctl/uapi: Remove #ifdef for CHECKPOINT_RESTORE As userspace knows nothing about kernel config, thus #ifdefs around ABI prctl constants makes them invisible to userspace. Let it be clean'n'simple: remove #ifdefs. If kernel has CONFIG_CHECKPOINT_RESTORE disabled, sys_prctl() will return -EINVAL for those prctls. Reported-by: Paul Bolle Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: oleg@redhat.com Fixes: 2eefd8789698 ("x86/arch_prctl/vdso: Add ARCH_MAP_VDSO_*") Link: http://lkml.kernel.org/r/20161027141516.28447-2-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/prctl.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index ae135de547f5..835aa51c7f6e 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -6,10 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 -#ifdef CONFIG_CHECKPOINT_RESTORE -# define ARCH_MAP_VDSO_X32 0x2001 -# define ARCH_MAP_VDSO_32 0x2002 -# define ARCH_MAP_VDSO_64 0x2003 -#endif +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ From 67dece7d4c5841e84a3c795e79bf0dcd5be54f55 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 27 Oct 2016 17:15:16 +0300 Subject: [PATCH 29/40] x86/vdso: Set vDSO pointer only after success Those pointers were initialized before call to _install_special_mapping() after the commit: f7b6eb3fa072 ("x86: Set context.vdso before installing the mapping") This is not required anymore as special mappings have their vma name and don't use arch_vma_name() after commit: a62c34bd2a8a ("x86, mm: Improve _install_special_mapping and fix x86 vdso naming") So, this way to init looks less entangled. I even belive that we can remove NULL initializers: - on failure load_elf_binary() will not start a new thread; - arch_prctl will have the same pointers as before syscall. Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: oleg@redhat.com Link: http://lkml.kernel.org/r/20161027141516.28447-3-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 23c881caabd1..e739002427ed 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -161,8 +161,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -189,14 +187,12 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size); + } else { + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; } up_fail: - if (ret) { - current->mm->context.vdso = NULL; - current->mm->context.vdso_image = NULL; - } - up_write(&mm->mmap_sem); return ret; } From 24d86f59093b0bcb3756cdf47f2db10ff4e90dbb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 27 Oct 2016 08:10:58 -0500 Subject: [PATCH 30/40] x86/unwind: Ensure stack grows down Add a sanity check to ensure the stack only grows down, and print a warning if the check fails. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161027131058.tpdffwlqipv7pcd6@treble Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 5639db619c03..ea7b7f9a3b9e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -32,6 +32,15 @@ unsigned long unwind_get_return_address(struct unwind_state *state) } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static size_t regs_size(struct pt_regs *regs) +{ + /* x86_32 regs from kernel mode are two words shorter: */ + if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) + return sizeof(*regs) - 2*sizeof(long); + + return sizeof(*regs); +} + static bool is_last_task_frame(struct unwind_state *state) { unsigned long bp = (unsigned long)state->bp; @@ -79,6 +88,7 @@ bool unwind_next_frame(struct unwind_state *state) struct pt_regs *regs; unsigned long *next_bp, *next_frame; size_t next_len; + enum stack_type prev_type = state->stack_info.type; if (unwind_done(state)) return false; @@ -142,6 +152,15 @@ bool unwind_next_frame(struct unwind_state *state) goto bad_address; } + /* Make sure it only unwinds up and doesn't overlap the last frame: */ + if (state->stack_info.type == prev_type) { + if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) + goto bad_address; + + if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) + goto bad_address; + } + /* move to the next frame */ if (regs) { state->regs = regs; @@ -154,10 +173,17 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: - printk_deferred_once(KERN_WARNING - "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", - state->bp, state->task->comm, - state->task->pid, next_bp); + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_frame); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_frame); + } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; return false; From adb402cd1461eef6e1a21db4532a3b9e6a6be853 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 31 Oct 2016 16:10:15 +0100 Subject: [PATCH 31/40] x86/copy_user: Unify the code by removing the 64-bit asm _copy_*_user() variants We already have the same functionality in usercopy_32.c. Share it with 64-bit and get rid of some more asm glue which is not needed anymore. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161031151015.22087-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/lib/copy_user_64.S | 47 ----------------------------------- arch/x86/lib/usercopy.c | 49 +++++++++++++++++++++++++++++++++++++ arch/x86/lib/usercopy_32.c | 49 ------------------------------------- 3 files changed, 49 insertions(+), 96 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b48f88..c5959576c315 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,53 +16,6 @@ #include #include -/* Standard copy_to_user with segment limit checking */ -ENTRY(_copy_to_user) - mov PER_CPU_VAR(current_task), %rax - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_to_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_to_user) -EXPORT_SYMBOL(_copy_to_user) - -/* Standard copy_from_user with segment limit checking */ -ENTRY(_copy_from_user) - mov PER_CPU_VAR(current_task), %rax - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_from_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_from_user) -EXPORT_SYMBOL(_copy_from_user) - - - .section .fixup,"ax" - /* must zero dest */ -ENTRY(bad_from_user) -bad_from_user: - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret -ENDPROC(bad_from_user) - .previous - /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index b4908789484e..c074799bddae 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -34,3 +34,52 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +/** + * copy_to_user: - Copy a block of data into user space. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from kernel space to user space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + */ +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); + +/** + * copy_from_user: - Copy a block of data from user space. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from user space to kernel space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + */ +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) +{ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} +EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3bc7baf2a711..0b281217c890 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -640,52 +640,3 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. - */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); From 47f10a36003eaf493125a5e6687dd1ff775bfd8c Mon Sep 17 00:00:00 2001 From: He Chen Date: Fri, 11 Nov 2016 17:25:34 +0800 Subject: [PATCH 32/40] x86/cpuid: Cleanup cpuid_regs definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpuid_regs is defined multiple times as structure and enum. Rename the enum and move all of it to processor.h so we don't end up with more instances. Rename the misnomed register enumeration from CR_* to the obvious CPUID_*. [ tglx: Rewrote changelog ] Signed-off-by: He Chen Reviewed-by: Borislav Petkov Cc: Luwei Kang Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Piotr Luc Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/1478856336-9388-2-git-send-email-he.chen@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/events/intel/pt.c | 45 ++++++++++++++------------------ arch/x86/include/asm/processor.h | 11 ++++++++ arch/x86/kernel/cpu/scattered.c | 28 ++++++++------------ arch/x86/kernel/cpuid.c | 4 --- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c5047b8f777b..1c1b9fe705c8 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -36,13 +36,6 @@ static DEFINE_PER_CPU(struct pt, pt_ctx); static struct pt_pmu pt_pmu; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - /* * Capabilities of Intel PT hardware, such as number of address bits or * supported output schemes, are cached and exported to userspace as "caps" @@ -64,21 +57,21 @@ static struct pt_cap_desc { u8 reg; u32 mask; } pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), - PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), + PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)), + PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), + PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) @@ -213,10 +206,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 984a7bf17f6a..8f6ac5be84b7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -137,6 +137,17 @@ struct cpuinfo_x86 { u32 microcode; }; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 1db8dc490b66..dbb470e839f8 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,13 +17,6 @@ struct cpuid_bit { u32 sub_leaf; }; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { u32 max_level; @@ -31,14 +24,14 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { 0, 0, 0, 0, 0 } }; @@ -50,8 +43,9 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], - ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CPUID_EAX], + ®s[CPUID_EBX], ®s[CPUID_ECX], + ®s[CPUID_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2836de390f95..9095c80723d6 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -46,10 +46,6 @@ static struct class *cpuid_class; -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; - static void cpuid_smp_cpuid(void *cmd_block) { struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; From 47bdf3378d62a627cfb8a54e1180c08d67078b61 Mon Sep 17 00:00:00 2001 From: He Chen Date: Fri, 11 Nov 2016 17:25:35 +0800 Subject: [PATCH 33/40] x86/cpuid: Provide get_scattered_cpuid_leaf() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sparse populated CPUID leafs are collected in a software provided leaf to avoid bloat of the x86_capability array, but there is no way to rebuild the real leafs (e.g. for KVM CPUID enumeration) other than rereading the CPUID leaf from the CPU. While this is possible it is problematic as it does not take software disabled features into account. If a feature is disabled on the host it should not be exposed to a guest either. Add get_scattered_cpuid_leaf() which rebuilds the leaf from the scattered cpuid table information and the active CPU features. [ tglx: Rewrote changelog ] Signed-off-by: He Chen Reviewed-by: Borislav Petkov Cc: Luwei Kang Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Piotr Luc Cc: Borislav Petkov Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/1478856336-9388-3-git-send-email-he.chen@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/processor.h | 3 ++ arch/x86/kernel/cpu/scattered.c | 49 ++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8f6ac5be84b7..e7f8c62701d4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -189,6 +189,9 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index dbb470e839f8..d1316f9c8329 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,24 +17,25 @@ struct cpuid_bit { u32 sub_leaf; }; +/* Please keep the leaf sorted by cpuid_bit.level for faster search. */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { 0, 0, 0, 0, 0 } +}; + void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { u32 max_level; u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, - { 0, 0, 0, 0, 0 } - }; - for (cb = cpuid_bits; cb->feature; cb++) { /* Verify that the level is valid */ @@ -51,3 +52,27 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) set_cpu_cap(c, cb->feature); } } + +u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, + enum cpuid_regs_idx reg) +{ + const struct cpuid_bit *cb; + u32 cpuid_val = 0; + + for (cb = cpuid_bits; cb->feature; cb++) { + + if (level > cb->level) + continue; + + if (level < cb->level) + break; + + if (reg == cb->reg && sub_leaf == cb->sub_leaf) { + if (cpu_has(&boot_cpu_data, cb->feature)) + cpuid_val |= BIT(cb->bit); + } + } + + return cpuid_val; +} +EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf); From a8d9df5a509a232a959e4ef2e281f7ecd77810d6 Mon Sep 17 00:00:00 2001 From: Gayatri Kammela Date: Wed, 16 Nov 2016 12:11:00 -0800 Subject: [PATCH 34/40] x86/cpufeatures: Enable new AVX512 cpu features Add a few new AVX512 instruction groups/features for enumeration in /proc/cpuinfo: AVX512IFMA and AVX512VBMI. Clear the flags in fpu_xstate_clear_all_cpu_caps(). CPUID.(EAX=7,ECX=0):EBX[bit 21] AVX512IFMA CPUID.(EAX=7,ECX=0):ECX[bit 1] AVX512VBMI Detailed information of cpuid bits for the features can be found at https://bugzilla.kernel.org/show_bug.cgi?id=187891 Signed-off-by: Gayatri Kammela Reviewed-by: Borislav Petkov Cc: Ravi Shankar Cc: Fenghua Yu Cc: mingo@elte.hu Link: http://lkml.kernel.org/r/1479327060-18668-1-git-send-email-gayatri.kammela@intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 2 ++ arch/x86/kernel/fpu/xstate.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a39629206864..a3e8cd762a23 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -226,6 +226,7 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -279,6 +280,7 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 095ef7ddd6ae..ce47452879fd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); @@ -73,6 +74,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); From f4474c9f0bba17857b1a47c8dc89c07a0845c2b2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 17 Nov 2016 00:04:58 -0600 Subject: [PATCH 35/40] x86/dumpstack: Handle NULL stack pointer in show_trace_log_lvl() When show_trace_log_lvl() is called from show_regs(), it completely fails to dump the stack. This bug was introduced when show_stack_log_lvl() was removed with the following commit: 0ee1dd9f5e7e ("x86/dumpstack: Remove raw stack dump") Previous callers of that function now call show_trace_log_lvl() directly. That resulted in a subtle change, in that the 'stack' argument can now be NULL in certain cases. A NULL 'stack' pointer means that the stack dump should start from the topmost stack frame unless 'regs' is valid, in which case it should start from 'regs->sp'. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 0ee1dd9f5e7e ("x86/dumpstack: Remove raw stack dump") Link: http://lkml.kernel.org/r/c551842302a9c222d96a14e42e4003f059509f69.1479362652.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 499aa6f0fde5..1e057b01b648 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -59,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); /* * Iterate through the stacks, starting with the current stack pointer. From a582c540ac1b10f0a7d37415e04c4af42409fd08 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 16 Nov 2016 10:23:27 -0800 Subject: [PATCH 36/40] x86/vdso: Use RDPID in preference to LSL when available RDPID is a new instruction that reads MSR_TSC_AUX quickly. This should be considerably faster than reading the GDT. Add a cpufeature for it and use it from __vdso_getcpu() when available. Tested-by: Megha Dey Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4f6c3a22012d10f1c65b9ca15800e01b42c7d39d.1479320367.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/vgtod.h | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a3e8cd762a23..eac7572bf8bb 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -283,6 +283,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index e728699db774..3a01996db58f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void) * works on all CPUs. This is volatile so that it orders * correctly wrt barrier() and to keep gcc from cleverly * hoisting it out of the calling function. + * + * If RDPID is available, use it. */ - asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + alternative_io ("lsl %[p],%[seg]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); return p; } From 3200ca806942d3d8f78d0d0c822e345dfb8789e7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 16 Nov 2016 10:23:28 -0800 Subject: [PATCH 37/40] selftests/x86: Add test_vdso to test getcpu() I'll eventually add tests for more vDSO functions here. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Megha Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/945cd29901a62a3cc6ea7d6ee5e389ab1ec1ac0c.1479320367.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/test_vdso.c | 123 ++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/test_vdso.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index a89f80a5b711..8c1cb423cfe6 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -6,7 +6,7 @@ include ../lib.mk TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ check_initial_reg_state sigreturn ldt_gdt iopl \ - protection_keys + protection_keys test_vdso TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c new file mode 100644 index 000000000000..65d7a2bf7e14 --- /dev/null +++ b/tools/testing/selftests/x86/test_vdso.c @@ -0,0 +1,123 @@ +/* + * ldt_gdt.c - Test cases for LDT and GDT access + * Copyright (c) 2011-2015 Andrew Lutomirski + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef SYS_getcpu +# ifdef __x86_64__ +# define SYS_getcpu 309 +# else +# define SYS_getcpu 318 +# endif +#endif + +int nerrs = 0; + +#ifdef __x86_64__ +# define VSYS(x) (x) +#else +# define VSYS(x) 0 +#endif + +typedef long (*getcpu_t)(unsigned *, unsigned *, void *); + +const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800); +getcpu_t vdso_getcpu; + +void fill_function_pointers() +{ + void *vdso = dlopen("linux-vdso.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) + vdso = dlopen("linux-gate.so.1", + RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); + if (!vdso) { + printf("[WARN]\tfailed to find vDSO\n"); + return; + } + + vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu"); + if (!vdso_getcpu) + printf("Warning: failed to find getcpu in vDSO\n"); +} + +static long sys_getcpu(unsigned * cpu, unsigned * node, + void* cache) +{ + return syscall(__NR_getcpu, cpu, node, cache); +} + +static void test_getcpu(void) +{ + printf("[RUN]\tTesting getcpu...\n"); + + for (int cpu = 0; ; cpu++) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + return; + + unsigned cpu_sys, cpu_vdso, cpu_vsys, + node_sys, node_vdso, node_vsys; + long ret_sys, ret_vdso = 1, ret_vsys = 1; + unsigned node; + + ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0); + if (vdso_getcpu) + ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0); + if (vgetcpu) + ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0); + + if (!ret_sys) + node = node_sys; + else if (!ret_vdso) + node = node_vdso; + else if (!ret_vsys) + node = node_vsys; + + bool ok = true; + if (!ret_sys && (cpu_sys != cpu || node_sys != node)) + ok = false; + if (!ret_vdso && (cpu_vdso != cpu || node_vdso != node)) + ok = false; + if (!ret_vsys && (cpu_vsys != cpu || node_vsys != node)) + ok = false; + + printf("[%s]\tCPU %u:", ok ? "OK" : "FAIL", cpu); + if (!ret_sys) + printf(" syscall: cpu %u, node %u", cpu_sys, node_sys); + if (!ret_vdso) + printf(" vdso: cpu %u, node %u", cpu_vdso, node_vdso); + if (!ret_vsys) + printf(" vsyscall: cpu %u, node %u", cpu_vsys, + node_vsys); + printf("\n"); + + if (!ok) + nerrs++; + } +} + +int main(int argc, char **argv) +{ + fill_function_pointers(); + + test_getcpu(); + + return nerrs ? 1 : 0; +} From 3d02a9c48d479eb58841805baaf93c5a084b6010 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Nov 2016 11:46:23 -0600 Subject: [PATCH 38/40] x86/dumpstack: Make stack name tags more comprehensible NMI stack dumps are bracketed by the following tags: ... The ending tag is kind of confusing if you don't already know what "EOE" means (end of exception). The same ending tag is also used to mark the end of all other exceptions' stacks. For example: <#DF> ... And similarly, "EOI" is used as the ending tag for interrupts: ... Change the tags to be more comprehensible by making them symmetrical and more XML-esque: ... <#DF> ... ... Signed-off-by: Josh Poimboeuf Acked-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/180196e3754572540b595bc56b947d43658979a7.1479491159.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/stacktrace.h | 3 +-- arch/x86/kernel/dumpstack.c | 12 ++++++------ arch/x86/kernel/dumpstack_32.c | 19 ++++++++----------- arch/x86/kernel/dumpstack_64.c | 22 ++++++++-------------- 4 files changed, 23 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 1e375b05cfa8..a3269c897ec5 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -30,8 +30,7 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); -void stack_type_str(enum stack_type type, const char **begin, - const char **end); +const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1e057b01b648..0e5c9d0f6c28 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -76,7 +76,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack */ for (regs = NULL; stack; stack = stack_info.next_sp) { - const char *str_begin, *str_end; + const char *stack_name; /* * If we overflowed the task stack into a guard page, jump back @@ -88,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - stack_type_str(stack_info.type, &str_begin, &str_end); - if (str_begin) - printk("%s <%s>\n", log_lvl, str_begin); + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); /* * Scan the stack, printing any text addresses we find. At the @@ -155,8 +155,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, __show_regs(regs, 0); } - if (str_end) - printk("%s <%s>\n", log_lvl, str_end); + if (stack_name) + printk("%s \n", log_lvl, stack_name); } } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index d7d20999d68c..bb3b5b9a6899 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,18 +16,15 @@ #include -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { - switch (type) { - case STACK_TYPE_IRQ: - case STACK_TYPE_SOFTIRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + return NULL; } static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ab0f8b90b51b..fac189efcc34 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - switch (type) { - case STACK_TYPE_IRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; - *end = "EOE"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; } static bool in_exception_stack(unsigned long *stack, struct stack_info *info) From 9b032d21f6482ee305dcdec418c15153614b1dcc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 24 Nov 2016 22:05:50 +0100 Subject: [PATCH 39/40] x86/boot/64: Use defines for page size ... instead of naked numbers like the rest of the asm does in this file. No code changed: # arch/x86/kernel/head_64.o: text data bss dec hex filename 1124 290864 4096 296084 48494 head_64.o.before 1124 290864 4096 296084 48494 head_64.o.after md5: 87086e202588939296f66e892414ffe2 head_64.o.before.asm 87086e202588939296f66e892414ffe2 head_64.o.after.asm Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161124210550.15025-1-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07cd27e33a5..a15d381e6020 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -112,20 +112,20 @@ startup_64: movq %rdi, %rax shrq $PGDIR_SHIFT, %rax - leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx movq %rdx, 0(%rbx,%rax,8) movq %rdx, 8(%rbx,%rax,8) - addq $4096, %rdx + addq $PAGE_SIZE, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) incl %eax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) - addq $8192, %rbx + addq $PAGE_SIZE * 2, %rbx movq %rdi, %rax shrq $PMD_SHIFT, %rdi addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax From 53938ee427bf27525a63721b7e25d86b8f31f161 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 28 Nov 2016 17:06:35 -0600 Subject: [PATCH 40/40] scripts/decode_stacktrace.sh: Fix address line detection on x86 Kirill reported that the decode_stacktrace.sh script was broken by the following commit: bb5e5ce545f2 ("x86/dumpstack: Remove kernel text addresses from stack dump") Fix it by updating the per-line absolute address check to also check for function-based address lines like the following: write_sysrq_trigger+0x51/0x60 I didn't remove the check for absolute addresses because it's still needed for ARM. Reported-by: Kirill A. Shutemov Tested-by: Kirill A. Shutemov Signed-off-by: Josh Poimboeuf Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Fixes: bb5e5ce545f2 ("x86/dumpstack: Remove kernel text addresses from stack dump") Link: http://lkml.kernel.org/r/20161128230635.4n2ofgawltgexgcg@treble Signed-off-by: Ingo Molnar --- scripts/decode_stacktrace.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index c332684e1b5a..5206d99ddeb8 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -139,7 +139,8 @@ handle_line() { while read line; do # Let's see if we have an address in the line - if [[ $line =~ \[\<([^]]+)\>\] ]]; then + if [[ $line =~ \[\<([^]]+)\>\] ]] || + [[ $line =~ [^+\ ]+\+0x[0-9a-f]+/0x[0-9a-f]+ ]]; then # Translate address to line numbers handle_line "$line" # Is it a code line?