Diffstat (limited to 'arch/x86/include/asm')
98 files changed, 2191 insertions, 1376 deletions
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 0ab4f9fd2687..3a45668f6dc3 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -50,6 +50,7 @@ void acpi_pic_sci_set_trigger(unsigned int, u16); extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi, int trigger, int polarity); +extern void (*__acpi_unregister_gsi)(u32 gsi); static inline void disable_acpi(void) { diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..bdf02eeee765 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,63 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,69 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" 
oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas works with s32s. + */ +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. + */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ + "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ - -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +166,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. 
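 *
 * alternative_2() above is the first inline-assembly primitive that can
 * patch a single site with either of two replacements. alt_max_short()
 * picks the longer replacement length without a branch: gas comparison
 * operators evaluate to -1 when true, so -(-((a) < (b))) is an all-ones
 * mask exactly when a < b, and a ^ ((a ^ b) & mask) then yields b (and a
 * otherwise). OLDINSTR_2 pads the original instruction with 0x90 (NOP)
 * up to that maximum. A sketch of the resulting usage, taken from the
 * rdtsc_barrier() conversion in the barrier.h hunk below:
 *
 *	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
 *		      "lfence", X86_FEATURE_LFENCE_RDTSC);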
* diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 465b309af254..976b86a325e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } @@ -106,7 +106,14 @@ extern u32 native_safe_apic_wait_icr_idle(void); extern void native_apic_icr_write(u32 low, u32 id); extern u64 native_apic_icr_read(void); -extern int x2apic_mode; +static inline bool apic_is_x2apic_enabled(void) +{ + u64 msr; + + if (rdmsrl_safe(MSR_IA32_APICBASE, &msr)) + return false; + return msr & X2APIC_ENABLE; +} #ifdef CONFIG_X86_X2APIC /* @@ -169,48 +176,23 @@ static inline u64 native_x2apic_icr_read(void) return val; } +extern int x2apic_mode; extern int x2apic_phys; -extern int x2apic_preenabled; -extern void check_x2apic(void); -extern void enable_x2apic(void); +extern void __init check_x2apic(void); +extern void x2apic_setup(void); static inline int x2apic_enabled(void) { - u64 msr; - - if (!cpu_has_x2apic) - return 0; - - rdmsrl(MSR_IA32_APICBASE, msr); - if (msr & X2APIC_ENABLE) - return 1; - return 0; + return cpu_has_x2apic && apic_is_x2apic_enabled(); } #define x2apic_supported() (cpu_has_x2apic) -static inline void x2apic_force_phys(void) -{ - x2apic_phys = 1; -} #else -static inline void disable_x2apic(void) -{ -} -static inline void check_x2apic(void) -{ -} -static inline void enable_x2apic(void) -{ -} -static inline int x2apic_enabled(void) -{ - return 0; -} -static inline void x2apic_force_phys(void) -{ -} +static inline void check_x2apic(void) { } +static inline void x2apic_setup(void) { } +static inline int x2apic_enabled(void) { return 0; } -#define x2apic_preenabled 0 -#define x2apic_supported() 0 +#define x2apic_mode (0) +#define x2apic_supported() (0) #endif extern void enable_IR_x2apic(void); @@ -219,22 +201,29 @@ extern int get_physical_broadcast(void); extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); -extern void connect_bsp_APIC(void); extern void disconnect_bsp_APIC(int virt_wire_setup); extern void disable_local_APIC(void); extern void lapic_shutdown(void); -extern int verify_local_APIC(void); extern void sync_Arb_IDs(void); extern void init_bsp_APIC(void); extern void setup_local_APIC(void); -extern void end_local_APIC_setup(void); -extern void bsp_end_local_APIC_setup(void); extern void init_apic_mappings(void); void register_lapic_address(unsigned long address); extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); + +#ifdef CONFIG_X86_64 +static inline int apic_force_enable(unsigned long addr) +{ + return -1; +} +#else extern int apic_force_enable(unsigned long addr); +#endif + +extern int apic_bsp_setup(bool upmode); +extern void apic_ap_setup(void); /* * On 32bit this is mach-xxx local diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0f4460b5636d..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -24,78 +24,28 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. 
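 *
 * (This hunk drops the long read_barrier_depends() essay -- the barrier
 *  itself remains a no-op below -- and introduces dma_rmb()/dma_wmb()
 *  for ordering CPU accesses to DMA-coherent memory. A hypothetical
 *  descriptor-ring sketch of their intended use; desc, DESC_DONE and
 *  process_buffer() are made-up names, not from this patch:
 *
 *	if (desc->status & DESC_DONE) {
 *		dma_rmb();
 *		process_buffer(desc->data);
 *	}
 *
 *  so the status read is ordered before the payload read.)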
- * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() #ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() +#define dma_rmb() rmb() #else -# define smp_rmb() barrier() +#define dma_rmb() barrier() #endif +#define dma_wmb() barrier() + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define smp_read_barrier_depends() read_barrier_depends() #define set_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) #define set_mb(var, value) do { var = value; barrier(); } while (0) #endif /* SMP */ +#define read_barrier_depends() do { } while (0) +#define smp_read_barrier_depends() do { } while (0) + #if defined(CONFIG_X86_PPRO_FENCE) /* @@ -145,13 +95,11 @@ do { \ * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9863ee3747da..47c8e32f621a 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -5,65 +5,6 @@ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> -#ifdef CONFIG_X86_PAT -/* - * X86 PAT uses page flags WC and Uncached together to keep track of - * memory type of pages that have backing page struct. 
X86 PAT supports 3 - * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and - * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not - * been changed from its default (value of -1 used to denote this). - * Note we do not support _PAGE_CACHE_UC here. - */ - -#define _PGMT_DEFAULT 0 -#define _PGMT_WC (1UL << PG_arch_1) -#define _PGMT_UC_MINUS (1UL << PG_uncached) -#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) -#define _PGMT_CLEAR_MASK (~_PGMT_MASK) - -static inline unsigned long get_page_memtype(struct page *pg) -{ - unsigned long pg_flags = pg->flags & _PGMT_MASK; - - if (pg_flags == _PGMT_DEFAULT) - return -1; - else if (pg_flags == _PGMT_WC) - return _PAGE_CACHE_WC; - else if (pg_flags == _PGMT_UC_MINUS) - return _PAGE_CACHE_UC_MINUS; - else - return _PAGE_CACHE_WB; -} - -static inline void set_page_memtype(struct page *pg, unsigned long memtype) -{ - unsigned long memtype_flags = _PGMT_DEFAULT; - unsigned long old_flags; - unsigned long new_flags; - - switch (memtype) { - case _PAGE_CACHE_WC: - memtype_flags = _PGMT_WC; - break; - case _PAGE_CACHE_UC_MINUS: - memtype_flags = _PGMT_UC_MINUS; - break; - case _PAGE_CACHE_WB: - memtype_flags = _PGMT_WB; - break; - } - - do { - old_flags = pg->flags; - new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); -} -#else -static inline unsigned long get_page_memtype(struct page *pg) { return -1; } -static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } -#endif - /* * The set_memory_* API can be used to change various attributes of a virtual * address range. The attributes include: diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 76659b67fd11..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -55,144 +55,157 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 - -/* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ -/* end of arguments */ - -/* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 - -#define ARGOFFSET R11 -#define SWFRAME ORIG_RAX - - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 - subq $9*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq_cfi rdi, 8*8 - movq_cfi rsi, 7*8 - movq_cfi rdx, 6*8 - - .if \save_rcx - movq_cfi rcx, 5*8 - .endif +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. 
+ * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + subq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 15*8+\addskip + .endm - .if \rax_enosys - movq $-ENOSYS, 4*8(%rsp) - .else - movq_cfi rax, 4*8 + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq_cfi r11, 6*8+\offset .endif - - .if \save_r891011 - movq_cfi r8, 3*8 - movq_cfi r9, 2*8 - movq_cfi r10, 1*8 - movq_cfi r11, 0*8 + .if \r8910 + movq_cfi r10, 7*8+\offset + movq_cfi r9, 8*8+\offset + movq_cfi r8, 9*8+\offset + .endif + .if \rax + movq_cfi rax, 10*8+\offset + .endif + .if \rcx + movq_cfi rcx, 11*8+\offset .endif + movq_cfi rdx, 12*8+\offset + movq_cfi rsi, 13*8+\offset + movq_cfi rdi, 14*8+\offset + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + + .macro SAVE_EXTRA_REGS offset=0 + movq_cfi r15, 0*8+\offset + movq_cfi r14, 1*8+\offset + movq_cfi r13, 2*8+\offset + movq_cfi r12, 3*8+\offset + movq_cfi rbp, 4*8+\offset + movq_cfi rbx, 5*8+\offset + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq_cfi rbp, 4*8+\offset + .endm + .macro RESTORE_EXTRA_REGS offset=0 + movq_cfi_restore 0*8+\offset, r15 + movq_cfi_restore 1*8+\offset, r14 + movq_cfi_restore 2*8+\offset, r13 + movq_cfi_restore 3*8+\offset, r12 + movq_cfi_restore 4*8+\offset, rbp + movq_cfi_restore 5*8+\offset, rbx .endm -#define ARG_SKIP (9*8) + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ - rstor_r8910=1, rstor_rdx=1 + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 0*8, r11 + movq_cfi_restore 6*8, r11 .endif - .if \rstor_r8910 - movq_cfi_restore 1*8, r10 - movq_cfi_restore 2*8, r9 - movq_cfi_restore 3*8, r8 + movq_cfi_restore 7*8, r10 + movq_cfi_restore 8*8, r9 + movq_cfi_restore 9*8, r8 .endif - .if \rstor_rax - movq_cfi_restore 4*8, rax + movq_cfi_restore 10*8, rax .endif - .if \rstor_rcx - movq_cfi_restore 5*8, rcx + movq_cfi_restore 11*8, rcx .endif - .if \rstor_rdx - movq_cfi_restore 6*8, rdx - .endif - - movq_cfi_restore 7*8, rsi - movq_cfi_restore 8*8, rdi - - .if ARG_SKIP+\addskip > 0 - addq $ARG_SKIP+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + movq_cfi_restore 12*8, rdx .endif + movq_cfi_restore 13*8, rsi + movq_cfi_restore 14*8, rdi .endm - - .macro LOAD_ARGS offset, skiprax=0 - movq \offset(%rsp), %r11 - movq \offset+8(%rsp), %r10 - movq \offset+16(%rsp), %r9 - movq \offset+24(%rsp), %r8 - movq \offset+40(%rsp), %rcx - movq \offset+48(%rsp), %rdx - movq \offset+56(%rsp), %rsi - movq \offset+64(%rsp), %rdi - .if \skiprax - .else - movq \offset+72(%rsp), %rax - .endif + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 .endm - -#define REST_SKIP (6*8) - - .macro SAVE_REST - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq_cfi rbx, 5*8 - movq_cfi rbp, 4*8 - movq_cfi 
r12, 3*8 - movq_cfi r13, 2*8 - movq_cfi r14, 1*8 - movq_cfi r15, 0*8 + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 .endm - - .macro RESTORE_REST - movq_cfi_restore 0*8, r15 - movq_cfi_restore 1*8, r14 - movq_cfi_restore 2*8, r13 - movq_cfi_restore 3*8, r12 - movq_cfi_restore 4*8, rbp - movq_cfi_restore 5*8, rbx - addq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET -(REST_SKIP) + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 .endm - - .macro SAVE_ALL - SAVE_ARGS - SAVE_REST + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 .endm - .macro RESTORE_ALL addskip=0 - RESTORE_REST - RESTORE_ARGS 1, \addskip + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + addq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -211,37 +224,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx .endm .macro RESTORE_ALL - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg ebx + popl_cfi_reg ecx + popl_cfi_reg edx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebp + popl_cfi_reg eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = this_cpu_read(old_rsp) - 128; + sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0bb1335313b2..854c04b3c9c2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -174,6 +174,7 @@ #define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ #define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ /* @@ -189,6 +190,11 @@ #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */ +#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */ +#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */ +#define X86_FEATURE_HWP_EPP ( 7*32+13) /* 
Intel HWP_EPP */ +#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -225,7 +231,9 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ @@ -383,6 +391,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) #define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) +#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); @@ -411,6 +420,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -425,6 +435,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -450,6 +461,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -476,31 +488,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. 
- */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: jmp %l[t_no]\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -520,6 +531,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -534,6 +546,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 61fd18b83b6c..12cb66f6d3a5 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -114,5 +114,10 @@ static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_dec(void) { } #endif /* X86_64 */ +#ifdef CONFIG_CPU_SUP_AMD +extern void set_dr_addr_mask(unsigned long mask, int dr); +#else +static inline void set_dr_addr_mask(unsigned long mask, int dr) { } +#endif #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 50d033a8947d..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -251,7 +251,8 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } -#define _LDT_empty(info) \ +/* This intentionally ignores lm, since 32-bit apps don't have that field. */ +#define LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ @@ -261,11 +262,18 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) (info)->seg_not_present == 1 && \ (info)->useable == 0) -#ifdef CONFIG_X86_64 -#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) -#else -#define LDT_empty(info) (_LDT_empty(info)) -#endif +/* Lots of programs expect an all-zero user_desc to mean "no segment at all". 
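 * This differs from LDT_empty() above, which requires read_exec_only and
 * seg_not_present to be 1; an all-zero descriptor has every field 0, so
 * the two predicates have to be checked separately.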
*/ +static inline bool LDT_zero(const struct user_desc *info) +{ + return (info->base_addr == 0 && + info->limit == 0 && + info->contents == 0 && + info->read_exec_only == 0 && + info->seg_32bit == 0 && + info->limit_in_pages == 0 && + info->seg_not_present == 0 && + info->useable == 0); +} static inline void clear_LDT(void) { @@ -368,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -#define set_intr_gate(n, addr) \ +#define set_intr_gate_notrace(n, addr) \ do { \ BUG_ON((unsigned)n > 0xFF); \ _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ __KERNEL_CS); \ + } while (0) + +#define set_intr_gate(n, addr) \ + do { \ + set_intr_gate_notrace(n, addr); \ _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 0, 0, __KERNEL_CS); \ } while (0) diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 97534a7d38e3..f226df064660 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,6 +10,12 @@ * cpu_feature_enabled(). */ +#ifdef CONFIG_X86_INTEL_MPX +# define DISABLE_MPX 0 +#else +# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31)) +#endif + #ifdef CONFIG_X86_64 # define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) @@ -34,6 +40,6 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 #define DISABLED_MASK8 0 -#define DISABLED_MASK9 0 +#define DISABLED_MASK9 (DISABLE_MPX) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 0bdb0c54d9a1..fe884e18fa6e 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -70,7 +70,7 @@ #define MAX_DMA_CHANNELS 8 /* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -86,11 +86,23 @@ CFI_ADJUST_CFA_OFFSET 8 .endm + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + .macro pushfq_cfi pushfq CFI_ADJUST_CFA_OFFSET 8 @@ -116,11 +128,23 @@ CFI_ADJUST_CFA_OFFSET 4 .endm + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popl_cfi reg popl \reg CFI_ADJUST_CFA_OFFSET -4 .endm + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + .macro pushfl_cfi pushfl CFI_ADJUST_CFA_OFFSET 4 diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 9b11757975d0..3738b138b843 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -2,6 +2,8 @@ #define _ASM_X86_EFI_H #include <asm/i387.h> +#include <asm/pgtable.h> + /* * We map the EFI regions needed for runtime services non-contiguously, * with preserved alignment on virtual addresses starting from -4G down @@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, extern struct efi_scratch efi_scratch; extern void __init 
efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); -extern void __init efi_call_phys_prolog(void); -extern void __init efi_call_phys_epilog(void); +extern pgd_t * __init efi_call_phys_prolog(void); +extern void __init efi_call_phys_epilog(pgd_t *save_pgd); extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); @@ -158,6 +160,30 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( } #endif /* CONFIG_EFI_MIXED */ + +/* arch specific definitions used by the stub code */ + +struct efi_config { + u64 image_handle; + u64 table; + u64 allocate_pool; + u64 allocate_pages; + u64 get_memory_map; + u64 free_pool; + u64 free_pages; + u64 locate_handle; + u64 handle_protocol; + u64 exit_boot_services; + u64 text_output; + efi_status_t (*call)(unsigned long, ...); + bool is64; +} __packed; + +__pure const struct efi_config *__efi_early(void); + +#define efi_call_early(f, ...) \ + __efi_early()->call(__efi_early()->f, __VA_ARGS__); + extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..935588d95c82 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,10 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - regs->ax = regs->bx = regs->cx = regs->dx = 0; - regs->si = regs->di = regs->bp = 0; + /* Commented-out registers are cleared in stub_execve */ + /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; + regs->si = regs->di /*= regs->bp*/ = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; @@ -365,6 +366,7 @@ enum align_flags { struct va_alignment { int flags; unsigned long mask; + unsigned long bits; } ____cacheline_aligned; extern struct va_alignment va_align; diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h index 2519d0679d99..c3dd5e71f439 100644 --- a/arch/x86/include/asm/fb.h +++ b/arch/x86/include/asm/fb.h @@ -8,8 +8,12 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma, unsigned long off) { + unsigned long prot; + + prot = pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK; if (boot_cpu_data.x86 > 3) - pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; + pgprot_val(vma->vm_page_prot) = + prot | cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS); } extern int fb_is_primary_device(struct fb_info *info); diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index ffb1733ac91f..f80d70009ff8 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -69,7 +69,9 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, #else +#ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, +#endif #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, @@ -136,9 +138,7 @@ enum fixed_addresses { extern void reserve_top_address(unsigned long reserve); #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) -#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) extern 
int fixmaps_set; diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index e97622f57722..da5e96756570 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -67,6 +67,34 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft); static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif +/* + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. + * + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. + */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + +/* + * Used to indicate that the FPU state in memory is newer than the FPU + * state in registers, and the FPU state should be reloaded next time the + * task is run. Only safe on the current task, or non-running tasks. + */ +static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk) +{ + tsk->thread.fpu.last_cpu = ~0; +} + +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == this_cpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} + static inline int is_ia32_compat_frame(void) { return config_enabled(CONFIG_IA32_EMULATION) && @@ -107,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void) static inline void fx_finit(struct i387_fxsave_struct *fx) { - memset(fx, 0, xstate_size); fx->cwd = 0x37f; fx->mxcsr = MXCSR_DEFAULT; } @@ -207,7 +234,7 @@ static inline void fpu_fxsave(struct fpu *fpu) if (config_enabled(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); else if (config_enabled(CONFIG_AS_FXSAVEQ)) - asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); + asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory * operand uses any extended registers for addressing, a second @@ -290,9 +317,11 @@ static inline int fpu_restore_checking(struct fpu *fpu) static inline int restore_fpu_checking(struct task_struct *tsk) { - /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. "m" is a random variable that should be in L1 */ + /* + * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is + * pending. Clear the x87 state here by setting it to fixed values. + * "m" is a random variable that should be in L1. + */ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" @@ -349,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk) __thread_set_has_fpu(tsk); } -static inline void __drop_fpu(struct task_struct *tsk) +static inline void drop_fpu(struct task_struct *tsk) { + /* + * Forget coprocessor state.. + */ + preempt_disable(); + tsk->thread.fpu_counter = 0; + if (__thread_has_fpu(tsk)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" @@ -358,30 +393,29 @@ static inline void __drop_fpu(struct task_struct *tsk) _ASM_EXTABLE(1b, 2b)); __thread_fpu_end(tsk); } + + clear_stopped_child_used_math(tsk); + preempt_enable(); } -static inline void drop_fpu(struct task_struct *tsk) +static inline void restore_init_xstate(void) { - /* - * Forget coprocessor state.. 
- */ - preempt_disable(); - tsk->thread.fpu_counter = 0; - __drop_fpu(tsk); - clear_used_math(); - preempt_enable(); + if (use_xsave()) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); } -static inline void drop_init_fpu(struct task_struct *tsk) +/* + * Reset the FPU state in the eager case and drop it in the lazy case (later use + * will reinit it). + */ +static inline void fpu_reset_state(struct task_struct *tsk) { if (!use_eager_fpu()) drop_fpu(tsk); - else { - if (use_xsave()) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); - } + else + restore_init_xstate(); } /* @@ -398,24 +432,6 @@ static inline void drop_init_fpu(struct task_struct *tsk) */ typedef struct { int preload; } fpu_switch_t; -/* - * Must be run with preemption disabled: this clears the fpu_owner_task, - * on this CPU. - * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. - */ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) -{ - per_cpu(fpu_owner_task, cpu) = NULL; -} - -static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) -{ - return new == this_cpu_read_stable(fpu_owner_task) && - cpu == new->thread.fpu.last_cpu; -} - static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; @@ -424,13 +440,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = tsk_used_math(new) && (use_eager_fpu() || - new->thread.fpu_counter > 5); + fpu.preload = tsk_used_math(new) && + (use_eager_fpu() || new->thread.fpu_counter > 5); + if (__thread_has_fpu(old)) { if (!__save_init_fpu(old)) - cpu = ~0; - old->thread.fpu.last_cpu = cpu; - old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + task_disable_lazy_fpu_restore(old); + else + old->thread.fpu.last_cpu = cpu; + + /* But leave fpu_owner_task! */ + old->thread.fpu.has_fpu = 0; /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -441,10 +461,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->thread.fpu_counter = 0; - old->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(old); if (fpu.preload) { new->thread.fpu_counter++; - if (!use_eager_fpu() && fpu_lazy_restore(new, cpu)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -464,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu) { if (fpu.preload) { if (unlikely(restore_fpu_checking(new))) - drop_init_fpu(new); + fpu_reset_state(new); } } @@ -493,10 +513,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame) } /* - * Need to be preemption-safe. + * Needs to be preemption-safe. * * NOTE! user_fpu_begin() must be used only immediately before restoring - * it. This function does not do any save/restore on their own. + * the save state. It does not do any saving/restoring on its own. In + * lazy FPU mode, it is just an optimization to avoid a #NM exception, + * the task can lose the FPU right after preempt_enable(). 
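 *
 * A sketch of the intended pairing, mirroring switch_fpu_finish() above
 * (not a new API):
 *
 *	user_fpu_begin();
 *	if (unlikely(restore_fpu_checking(current)))
 *		fpu_reset_state(current);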
*/ static inline void user_fpu_begin(void) { @@ -518,24 +540,6 @@ static inline void __save_fpu(struct task_struct *tsk) } /* - * These disable preemption on their own and are safe - */ -static inline void save_init_fpu(struct task_struct *tsk) -{ - WARN_ON_ONCE(!__thread_has_fpu(tsk)); - - if (use_eager_fpu()) { - __save_fpu(tsk); - return; - } - - preempt_disable(); - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - preempt_enable(); -} - -/* * i387 state interaction */ static inline unsigned short get_fpu_cwd(struct task_struct *tsk) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e1f7fecaa7d6..f45acad3c4b6 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -1,39 +1,6 @@ #ifndef _ASM_X86_FTRACE_H #define _ASM_X86_FTRACE_H -#ifdef __ASSEMBLY__ - - /* skip is set if the stack was already partially adjusted */ - .macro MCOUNT_SAVE_FRAME skip=0 - /* - * We add enough stack to save all regs. - */ - subq $(SS+8-\skip), %rsp - movq %rax, RAX(%rsp) - movq %rcx, RCX(%rsp) - movq %rdx, RDX(%rsp) - movq %rsi, RSI(%rsp) - movq %rdi, RDI(%rsp) - movq %r8, R8(%rsp) - movq %r9, R9(%rsp) - /* Move RIP to its proper location */ - movq SS+8(%rsp), %rdx - movq %rdx, RIP(%rsp) - .endm - - .macro MCOUNT_RESTORE_FRAME skip=0 - movq R9(%rsp), %r9 - movq R8(%rsp), %r8 - movq RDI(%rsp), %rdi - movq RSI(%rsp), %rsi - movq RDX(%rsp), %rdx - movq RCX(%rsp), %rcx - movq RAX(%rsp), %rax - addq $(SS+8-\skip), %rsp - .endm - -#endif - #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY # define MCOUNT_ADDR ((long)(__fentry__)) diff --git a/arch/x86/include/asm/hash.h b/arch/x86/include/asm/hash.h deleted file mode 100644 index e8c58f88b1d4..000000000000 --- a/arch/x86/include/asm/hash.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_HASH_H -#define _ASM_X86_HASH_H - -struct fast_hash_ops; -extern void setup_arch_fast_hash(struct fast_hash_ops *ops); - -#endif /* _ASM_X86_HASH_H */ diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 302a323b3f67..04e9d023168f 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -38,17 +38,20 @@ extern unsigned long highstart_pfn, highend_pfn; /* * Ordering is: * - * FIXADDR_TOP - * fixed_addresses - * FIXADDR_START - * temp fixed addresses - * FIXADDR_BOOT_START - * Persistent kmap area - * PKMAP_BASE - * VMALLOC_END - * Vmalloc area - * VMALLOC_START - * high_memory + * high memory on: high_memory off: + * FIXADDR_TOP FIXADDR_TOP + * fixed addresses fixed addresses + * FIXADDR_START FIXADDR_START + * temp fixed addresses/persistent kmap area VMALLOC_END + * PKMAP_BASE temp fixed addresses/vmalloc area + * VMALLOC_END VMALLOC_START + * vmalloc area high_memory + * VMALLOC_START + * high_memory + * + * The temp fixed area is only used during boot for early_ioremap(), and + * it is unused when the ioremap() is functional. vmalloc/pkmap area become + * available after early boot so the temp fixed area is available for re-use. 
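 *
 * (Boot-time sketch of that early_ioremap() use, via the generic early
 *  ioremap API rather than anything added by this patch:
 *
 *	addr = early_ioremap(phys, len);
 *	...probe the device...
 *	early_iounmap(addr, len);
 * )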
*/ #define LAST_PKMAP_MASK (LAST_PKMAP-1) #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index ef1c4d2d41ec..6c98be864a75 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -12,6 +12,7 @@ */ struct arch_hw_breakpoint { unsigned long address; + unsigned long mask; u8 len; u8 type; }; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 4615906d83df..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -94,30 +94,7 @@ extern void trace_call_function_single_interrupt(void); #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi #endif /* CONFIG_TRACING */ -/* IOAPIC */ -#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) -extern unsigned long io_apic_irqs; - -extern void setup_IO_APIC(void); -extern void disable_IO_APIC(void); - -struct io_apic_irq_attr { - int ioapic; - int ioapic_pin; - int trigger; - int polarity; -}; - -static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, - int ioapic, int ioapic_pin, - int trigger, int polarity) -{ - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; -} - +#ifdef CONFIG_IRQ_REMAP /* Intel specific interrupt remapping information */ struct irq_2_iommu { struct intel_iommu *iommu; @@ -131,14 +108,12 @@ struct irq_2_irte { u16 devid; /* Device ID for IRTE table */ u16 index; /* Index into IRTE table*/ }; +#endif /* CONFIG_IRQ_REMAP */ + +#ifdef CONFIG_X86_LOCAL_APIC +struct irq_data; -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. 
- */ struct irq_cfg { - struct irq_pin_list *irq_2_pin; cpumask_var_t domain; cpumask_var_t old_domain; u8 vector; @@ -150,18 +125,39 @@ struct irq_cfg { struct irq_2_irte irq_2_irte; }; #endif + union { +#ifdef CONFIG_X86_IO_APIC + struct { + struct list_head irq_2_pin; + }; +#endif + }; }; +extern struct irq_cfg *irq_cfg(unsigned int irq); +extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data); +extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node); +extern void lock_vector_lock(void); +extern void unlock_vector_lock(void); extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); +extern void clear_irq_vector(int irq, struct irq_cfg *cfg); +extern void setup_vector_irq(int cpu); +#ifdef CONFIG_SMP extern void send_cleanup_vector(struct irq_cfg *); +extern void irq_complete_move(struct irq_cfg *cfg); +#else +static inline void send_cleanup_vector(struct irq_cfg *c) { } +static inline void irq_complete_move(struct irq_cfg *c) { } +#endif -struct irq_data; -int __ioapic_set_affinity(struct irq_data *, const struct cpumask *, - unsigned int *dest_id); -extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); -extern void setup_ioapic_dest(void); - -extern void enable_IO_APIC(void); +extern int apic_retrigger_irq(struct irq_data *data); +extern void apic_ack_edge(struct irq_data *data); +extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id); +#else /* CONFIG_X86_LOCAL_APIC */ +static inline void lock_vector_lock(void) {} +static inline void unlock_vector_lock(void) {} +#endif /* CONFIG_X86_LOCAL_APIC */ /* Statistics */ extern atomic_t irq_err_count; @@ -185,9 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif -extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); +extern char irq_entries_start[]; #ifdef CONFIG_TRACING -#define trace_interrupt interrupt +#define trace_irq_entries_start irq_entries_start #endif #define VECTOR_UNDEFINED (-1) @@ -195,17 +191,6 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); -extern void setup_vector_irq(int cpu); - -#ifdef CONFIG_X86_IO_APIC -extern void lock_vector_lock(void); -extern void unlock_vector_lock(void); -extern void __setup_vector_irq(int cpu); -#else -static inline void lock_vector_lock(void) {} -static inline void unlock_vector_lock(void) {} -static inline void __setup_vector_irq(int cpu) {} -#endif #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ed8089d69094..6eb6fcb83f63 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -40,8 +40,8 @@ extern void __kernel_fpu_end(void); static inline void kernel_fpu_begin(void) { - WARN_ON_ONCE(!irq_fpu_usable()); preempt_disable(); + WARN_ON_ONCE(!irq_fpu_usable()); __kernel_fpu_begin(); } @@ -51,6 +51,10 @@ static inline void kernel_fpu_end(void) preempt_enable(); } +/* Must be called with preempt disabled */ +extern void kernel_fpu_disable(void); +extern void kernel_fpu_enable(void); + /* * Some instructions like VIA's padlock instructions generate a spurious * DNA fault but don't modify SSE registers. 
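 *
 * (Note: the kernel_fpu_begin() hunk above moves preempt_disable() ahead
 *  of the irq_fpu_usable() warning, presumably so the check and
 *  __kernel_fpu_begin() run on the same CPU. The canonical bracket for
 *  kernel-mode SIMD is unchanged:
 *
 *	kernel_fpu_begin();
 *	...SSE/AVX work, no sleeping...
 *	kernel_fpu_end();
 * )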
And these instructions diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h new file mode 100644 index 000000000000..cd2ce4068441 --- /dev/null +++ b/arch/x86/include/asm/imr.h @@ -0,0 +1,60 @@ +/* + * imr.h: Isolated Memory Region API + * + * Copyright(c) 2013 Intel Corporation. + * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _IMR_H +#define _IMR_H + +#include <linux/types.h> + +/* + * IMR agent access mask bits + * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register + * definitions. + */ +#define IMR_ESRAM_FLUSH BIT(31) +#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */ +#define IMR_RMU BIT(29) +#define IMR_VC1_SAI_ID3 BIT(15) +#define IMR_VC1_SAI_ID2 BIT(14) +#define IMR_VC1_SAI_ID1 BIT(13) +#define IMR_VC1_SAI_ID0 BIT(12) +#define IMR_VC0_SAI_ID3 BIT(11) +#define IMR_VC0_SAI_ID2 BIT(10) +#define IMR_VC0_SAI_ID1 BIT(9) +#define IMR_VC0_SAI_ID0 BIT(8) +#define IMR_CPU_0 BIT(1) /* SMM mode */ +#define IMR_CPU BIT(0) /* Non SMM mode */ +#define IMR_ACCESS_NONE 0 + +/* + * Read/Write access-all bits here include some reserved bits + * These are the values firmware uses and are accepted by hardware. + * The kernel defines read/write access-all in the same way as firmware + * in order to have a consistent and crisp definition across firmware, + * bootloader and kernel. + */ +#define IMR_READ_ACCESS_ALL 0xBFFFFFFF +#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF + +/* Number of IMRs provided by Quark X1000 SoC */ +#define QUARK_X1000_IMR_MAX 0x08 +#define QUARK_X1000_IMR_REGBASE 0x40 + +/* IMR alignment bits - only bits 31:10 are checked for IMR validity */ +#define IMR_ALIGN 0x400 +#define IMR_MASK (IMR_ALIGN - 1) + +int imr_add_range(phys_addr_t base, size_t size, + unsigned int rmask, unsigned int wmask, bool lock); + +int imr_remove_range(phys_addr_t base, size_t size); + +#endif /* _IMR_H */ diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 48eb30a86062..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -65,10 +65,11 @@ struct insn { unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; -#define MAX_INSN_SIZE 16 +#define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) @@ -96,7 +97,7 @@ struct insn { #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ -extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); @@ -115,12 +116,13 @@ static inline void insn_get_attribute(struct insn *insn) extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ -static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +static inline void kernel_insn_init(struct insn *insn, + const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 - insn_init(insn, kaddr, 1); + insn_init(insn, kaddr, buf_len, 1); #else 
/* CONFIG_X86_32 */ - insn_init(insn, kaddr, 0); + insn_init(insn, kaddr, buf_len, 0); #endif } diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index e34e097b6f9d..705d35708a50 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_mrst_console; -extern void mrst_early_console_init(void); - extern struct console early_hsu_console; extern void hsu_early_console_init(const char *); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index b8237d8a1e0c..34a5b93704d3 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -74,6 +74,9 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_readw __readw #define __raw_readl __readl +#define writeb_relaxed(v, a) __writeb(v, a) +#define writew_relaxed(v, a) __writew(v, a) +#define writel_relaxed(v, a) __writel(v, a) #define __raw_writeb __writeb #define __raw_writew __writew #define __raw_writel __writel @@ -86,6 +89,7 @@ build_mmio_read(readq, "q", unsigned long, "=r", :"memory") build_mmio_write(writeq, "q", unsigned long, "r", :"memory") #define readq_relaxed(a) readq(a) +#define writeq_relaxed(v, a) writeq(v, a) #define __raw_readq(a) readq(a) #define __raw_writeq(val, addr) writeq(val, addr) @@ -310,11 +314,11 @@ BUILDIO(b, b, char) BUILDIO(w, w, short) BUILDIO(l, , int) -extern void *xlate_dev_mem_ptr(unsigned long phys); -extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); +extern void *xlate_dev_mem_ptr(phys_addr_t phys); +extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); extern int ioremap_change_attr(unsigned long vaddr, unsigned long size, - unsigned long prot_val); + enum page_cache_mode pcm); extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); extern bool is_early_ioremap_ptep(pte_t *ptep); diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 1733ab49ac5e..2f91685fe1cd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -132,6 +132,10 @@ extern int noioapicquirk; /* -1 if "noapic" boot option passed */ extern int noioapicreroute; +extern unsigned long io_apic_irqs; + +#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs)) + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. 
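 *
 * Worked example for the IO_APIC_IRQ() test above, assuming the usual
 * NR_IRQS_LEGACY == 16: IO_APIC_IRQ(5) is true only if bit 5 is set in
 * the io_apic_irqs mask, while IO_APIC_IRQ(42) is always true since
 * 42 >= NR_IRQS_LEGACY.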
@@ -139,18 +143,15 @@ extern int noioapicreroute; #define io_apic_assign_pci_irqs \ (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) -struct io_apic_irq_attr; struct irq_cfg; extern void ioapic_insert_resources(void); +extern int arch_early_ioapic_init(void); extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *, unsigned int, int, struct io_apic_irq_attr *); extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg); -extern void native_compose_msi_msg(struct pci_dev *pdev, - unsigned int irq, unsigned int dest, - struct msi_msg *msg, u8 hpet_id); extern void native_eoi_ioapic_pin(int apic, int pin, int vector); extern int save_ioapic_entries(void); @@ -160,6 +161,13 @@ extern int restore_ioapic_entries(void); extern void setup_ioapic_ids_from_mpc(void); extern void setup_ioapic_ids_from_mpc_nocheck(void); +struct io_apic_irq_attr { + int ioapic; + int ioapic_pin; + int trigger; + int polarity; +}; + enum ioapic_domain_type { IOAPIC_DOMAIN_INVALID, IOAPIC_DOMAIN_LEGACY, @@ -188,8 +196,10 @@ extern int mp_find_ioapic_pin(int ioapic, u32 gsi); extern u32 mp_pin_to_gsi(int ioapic, int pin); extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags); extern void mp_unmap_irq(int irq); -extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base, - struct ioapic_domain_cfg *cfg); +extern int mp_register_ioapic(int id, u32 address, u32 gsi_base, + struct ioapic_domain_cfg *cfg); +extern int mp_unregister_ioapic(u32 gsi_base); +extern int mp_ioapic_registered(u32 gsi_base); extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq); extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq); @@ -227,19 +237,25 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned extern void io_apic_eoi(unsigned int apic, unsigned int vector); -extern bool mp_should_keep_irq(struct device *dev); - +extern void setup_IO_APIC(void); +extern void enable_IO_APIC(void); +extern void disable_IO_APIC(void); +extern void setup_ioapic_dest(void); +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin); +extern void print_IO_APICs(void); #else /* !CONFIG_X86_IO_APIC */ +#define IO_APIC_IRQ(x) 0 #define io_apic_assign_pci_irqs 0 #define setup_ioapic_ids_from_mpc x86_init_noop static inline void ioapic_insert_resources(void) { } +static inline int arch_early_ioapic_init(void) { return 0; } +static inline void print_IO_APICs(void) {} #define gsi_top (NR_IRQS_LEGACY) static inline int mp_find_ioapic(u32 gsi) { return 0; } static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; } static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; } static inline void mp_unmap_irq(int irq) { } -static inline bool mp_should_keep_irq(struct device *dev) { return 1; } static inline int save_ioapic_entries(void) { @@ -262,8 +278,12 @@ static inline void disable_ioapic_support(void) { } #define native_io_apic_print_entries NULL #define native_ioapic_set_affinity NULL #define native_setup_ioapic_entry NULL -#define native_compose_msi_msg NULL #define native_eoi_ioapic_pin NULL + +static inline void setup_IO_APIC(void) { } +static inline void enable_IO_APIC(void) { } +static inline void setup_ioapic_dest(void) { } + #endif #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f42a04735a0a..e37d6b3ad983 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -79,11 
+79,12 @@ struct iommu_table_entry { * d). Similar to the 'init', except that this gets called from pci_iommu_init * where we do have a memory allocator. * - * The standard vs the _FINISH differs in that the _FINISH variant will - * continue detecting other IOMMUs in the call list after the - * the detection routine returns a positive number. The _FINISH will - * stop the execution chain. Both will still call the 'init' and - * 'late_init' functions if they are set. + * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant + * in that the former will continue detecting other IOMMUs in the call + * list after the detection routine returns a positive number, while the + * latter will stop the execution chain upon first successful detection. + * Both variants will still call the 'init' and 'late_init' functions if + * they are set. */ #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index b7747c4c2cf2..6224d316c405 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,8 +33,6 @@ struct irq_cfg; #ifdef CONFIG_IRQ_REMAP -extern void setup_irq_remapping_ops(void); -extern int irq_remapping_supported(void); extern void set_irq_remapping_broken(void); extern int irq_remapping_prepare(void); extern int irq_remapping_enable(void); @@ -60,8 +58,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip); #else /* CONFIG_IRQ_REMAP */ -static inline void setup_irq_remapping_ops(void) { } -static inline int irq_remapping_supported(void) { return 0; } static inline void set_irq_remapping_broken(void) { } static inline int irq_remapping_prepare(void) { return -ENODEV; } static inline int irq_remapping_enable(void) { return -ENODEV; } diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 5702d7e3111d..666c89ec4bd7 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -126,6 +126,12 @@ #define NR_VECTORS 256 +#ifdef CONFIG_X86_LOCAL_APIC +#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR +#else +#define FIRST_SYSTEM_VECTOR NR_VECTORS +#endif + #define FPU_IRQ 13 #define FIRST_VM86_IRQ 3 diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) #define USERGS_SYSRET32 \ swapgs; \ sysretl -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - swapgs; \ - sti; \ - sysexit #else #define INTERRUPT_RETURN iret @@ -163,22 +159,27 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } +#endif /* !__ASSEMBLY__ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else - -#ifdef CONFIG_X86_64 -#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_X86_64 +# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ - LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + call lockdep_sys_exit_thunk; \ cli; \ TRACE_IRQS_OFF; - -#else -#define ARCH_LOCKDEP_SYS_EXIT \ +# else +# define 
LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ pushl %edx; \ @@ -186,24 +187,12 @@ static inline int arch_irqs_disabled(void) popl %edx; \ popl %ecx; \ popl %eax; - -#define ARCH_LOCKDEP_SYS_EXIT_IRQ -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; -# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +# define LOCKDEP_SYS_EXIT_IRQ +# endif #else -# define TRACE_IRQS_ON -# define TRACE_IRQS_OFF -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ -# else # define LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ -# endif - +#endif #endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 6a2cefb4395a..a4c1cf7e93f8 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H -#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ #include <linux/stringify.h> #include <linux/types.h> @@ -30,8 +30,6 @@ l_yes: return true; } -#endif /* __KERNEL__ */ - #ifdef CONFIG_X86_64 typedef u64 jump_label_t; #else @@ -44,4 +42,5 @@ struct jump_entry { jump_label_t key; }; +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h new file mode 100644 index 000000000000..8b22422fbad8 --- /dev/null +++ b/arch/x86/include/asm/kasan.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_KASAN_H +#define _ASM_X86_KASAN_H + +/* + * Compiler uses shadow offset assuming that addresses start + * from 0. Kernel addresses don't start from 0, so shadow + * for kernel really starts from compiler's shadow offset + + * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT + */ +#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ + (0xffff800000000000ULL >> 3)) +/* 47 bits for kernel address -> (47 - 3) bits for shadow */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + +#ifndef __ASSEMBLY__ + +extern pte_t kasan_zero_pte[]; +extern pte_t kasan_zero_pmd[]; +extern pte_t kasan_zero_pud[]; + +#ifdef CONFIG_KASAN +void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_init(void); +#else +static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_init(void) { } +#endif + +#endif + +#endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index eb181178fe0b..57a9d94fe160 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,7 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); }; typedef u32 __attribute__((vector_size(16))) sse128_t; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6ed0c30d6a0c..dea2e7e962e3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -33,13 +33,11 @@ #define KVM_MAX_VCPUS 255 #define KVM_SOFT_MAX_VCPUS 160 -#define KVM_USER_MEM_SLOTS 125 +#define KVM_USER_MEM_SLOTS 509 /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_MMIO_SIZE 16 - #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 @@ -51,6 +49,7 @@ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL +#define 
CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ @@ -82,11 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } -#define SELECTOR_TI_MASK (1 << 2) -#define SELECTOR_RPL_MASK 0x03 - -#define IOPL_SHIFT 12 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 @@ -159,6 +153,18 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff +#define PFERR_PRESENT_BIT 0 +#define PFERR_WRITE_BIT 1 +#define PFERR_USER_BIT 2 +#define PFERR_RSVD_BIT 3 +#define PFERR_FETCH_BIT 4 + +#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) +#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) +#define PFERR_USER_MASK (1U << PFERR_USER_BIT) +#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) +#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -334,6 +340,7 @@ struct kvm_pmu { enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, + KVM_DEBUGREG_RELOAD = 4, }; struct kvm_vcpu_arch { @@ -361,6 +368,7 @@ struct kvm_vcpu_arch { int mp_state; u64 ia32_misc_enable_msr; bool tpr_access_reporting; + u64 ia32_xss; /* * Paging state of the vcpu @@ -419,6 +427,9 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; + + int maxphyaddr; + /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; @@ -538,11 +549,20 @@ struct kvm_arch_memory_slot { struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; +/* + * We use as the mode the number of bits allocated in the LDR for the + * logical processor ID. It happens that these are all powers of two. + * This makes it very easy to detect cases where the APICs are + * configured for multiple modes; in that case, we cannot use the map and + * hence cannot use kvm_irq_delivery_to_apic_fast either. 
+ */ +#define KVM_APIC_MODE_XAPIC_CLUSTER 4 +#define KVM_APIC_MODE_XAPIC_FLAT 8 +#define KVM_APIC_MODE_X2APIC 16 + struct kvm_apic_map { struct rcu_head rcu; - u8 ldr_bits; - /* fields bellow are used to decode ldr values in different modes */ - u32 cid_shift, cid_mask, lid_mask; + u8 mode; struct kvm_lapic *phys_map[256]; /* first index is cluster id second is cpu id in a cluster */ struct kvm_lapic *logical_map[16][16]; @@ -602,6 +622,9 @@ struct kvm_arch { struct kvm_xen_hvm_config xen_hvm_config; + /* reads protected by irq_srcu, writes by irq_lock */ + struct hlist_head mask_notifier_list; + /* fields used by HYPER-V emulation */ u64 hv_guest_os_id; u64 hv_hypercall; @@ -610,6 +633,8 @@ struct kvm_arch { #ifdef CONFIG_KVM_MMU_AUDIT int audit_point; #endif + + bool boot_vcpu_runs_old_kvmclock; }; struct kvm_vm_stat { @@ -638,6 +663,7 @@ struct kvm_vcpu_stat { u32 irq_window_exits; u32 nmi_window_exits; u32 halt_exits; + u32 halt_successful_poll; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -659,6 +685,16 @@ struct msr_data { u64 data; }; +struct kvm_lapic_irq { + u32 vector; + u32 delivery_mode; + u32 dest_mode; + u32 level; + u32 trig_mode; + u32 shorthand; + u32 dest_id; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -767,10 +803,36 @@ struct kvm_x86_ops { enum x86_intercept_stage stage); void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); + bool (*xsaves_supported)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); + + /* + * Arch-specific dirty logging hooks. These hooks are only supposed to + * be valid if the specific arch has hardware-accelerated dirty logging + * mechanism. Currently only for PML on VMX. + * + * - slot_enable_log_dirty: + * called when enabling log dirty mode for the slot. + * - slot_disable_log_dirty: + * called when disabling log dirty mode for the slot. + * also called when slot is created with log dirty disabled. + * - flush_log_dirty: + * called before reporting dirty_bitmap to userspace. + * - enable_log_dirty_pt_masked: + * called when reenabling log dirty for the GFNs in the mask after + * corresponding bits are cleared in slot->dirty_bitmap. 
+ */ + void (*slot_enable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*slot_disable_log_dirty)(struct kvm *kvm, + struct kvm_memory_slot *slot); + void (*flush_log_dirty)(struct kvm *kvm); + void (*enable_log_dirty_pt_masked)(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t offset, unsigned long mask); }; struct kvm_arch_async_pf { @@ -803,10 +865,19 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); -void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_slot_set_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot); +void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, + struct kvm_memory_slot *slot, + gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); @@ -818,6 +889,19 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); +struct kvm_irq_mask_notifier { + void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); + int irq; + struct hlist_node link; +}; + +void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, + struct kvm_irq_mask_notifier *kimn); +void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, + bool mask); + extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); @@ -859,11 +943,12 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); +int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector); +void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); @@ -895,6 +980,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); +bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, int irq_source_id, int level) @@ -1053,7 +1139,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 
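The irq mask notifier interface declared above lets in-kernel users be told when the guest masks or unmasks an irqchip pin; kvm_fire_mask_notifiers() is invoked from the PIC/IO-APIC emulation. A hedged sketch of a consumer, where my_dev and the callback body are invented placeholders:

#include <linux/kvm_host.h>

struct my_dev {
	struct kvm *kvm;
	struct kvm_irq_mask_notifier kimn;
};

/* runs whenever the guest toggles the mask bit of the registered pin */
static void my_dev_mask_notify(struct kvm_irq_mask_notifier *kimn, bool masked)
{
	struct my_dev *dev = container_of(kimn, struct my_dev, kimn);

	/* e.g. pause or resume injecting interrupts on dev's behalf */
	(void)dev;
	(void)masked;
}

static void my_dev_attach(struct my_dev *dev, int irq)
{
	dev->kimn.func = my_dev_mask_notify;
	kvm_register_irq_mask_notifier(dev->kvm, irq, &dev->kimn);
}

static void my_dev_detach(struct my_dev *dev, int irq)
{
	kvm_unregister_irq_mask_notifier(dev->kvm, irq, &dev->kimn);
}
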
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); @@ -1066,6 +1151,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index e62cf897f781..c1adf33fdd0d 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void) static inline bool kvm_para_available(void) { - return 0; + return false; } static inline unsigned int kvm_arch_para_features(void) diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 879fd7d33877..ef01fef3eebc 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -16,7 +16,6 @@ #define LHCALL_SET_PTE 14 #define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 -#define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 #define LHCALL_SEND_INTERRUPTS 19 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..a455a53d789a --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,46 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include <linux/module.h> +#include <linux/ftrace.h> + +#ifdef CONFIG_LIVEPATCH +static inline int klp_check_compiler_support(void) +{ +#ifndef CC_USING_FENTRY + return 1; +#endif + return 0; +} +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} +#else +#error Live patching support is disabled; check CONFIG_LIVEPATCH +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 958b90f761e5..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -34,6 +34,10 @@ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ +/* AMD-specific bits */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ + /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is * bits 15:0. 
But bit 12 is the 'F' bit, defined for corrected @@ -78,7 +82,6 @@ /* Software defined banks */ #define MCE_EXTENDED_BANK 128 #define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0) -#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" @@ -113,6 +116,12 @@ struct mca_config { u32 rip_msr; }; +struct mce_vendor_flags { + __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ + __reserved_0 : 63; +}; +extern struct mce_vendor_flags mce_flags; + extern struct mca_config mca_cfg; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -125,9 +134,11 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_vendor_init_severity(void) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -180,14 +191,13 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { - MCP_TIMESTAMP = (1 << 0), /* log time stamp */ - MCP_UC = (1 << 1), /* log uncorrected errors */ - MCP_DONTLOG = (1 << 2), /* only clear, don't log */ + MCP_TIMESTAMP = BIT(0), /* log time stamp */ + MCP_UC = BIT(1), /* log uncorrected errors */ + MCP_DONTLOG = BIT(2), /* only clear, don't log */ }; -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); -void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 64dc362506b7..2fb20d6f7e23 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -75,9 +75,83 @@ static inline void __exit exit_amd_microcode(void) {} #ifdef CONFIG_MICROCODE_EARLY #define MAX_UCODE_COUNT 128 + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * During the early microcode loading phase on the BSP, boot_cpu_data is not + * set up yet, so x86_vendor() gets the vendor id for the BSP directly. + * + * In the 32-bit AP case, accessing boot_cpu_data needs a linear address. To + * simplify the code, we still use x86_vendor() to get the vendor id for APs. + * + * x86_vendor() reads the vendor information directly from CPUID. 
+ */ +static inline int x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static inline unsigned int __x86_family(unsigned int sig) +{ + unsigned int x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static inline unsigned int x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + return __x86_family(eax); +} + +static inline unsigned int x86_model(unsigned int sig) +{ + unsigned int x86, model; + + x86 = __x86_family(sig); + + model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + model += ((sig >> 16) & 0xf) << 4; + + return model; +} + extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); extern int __init save_microcode_in_initrd(void); +void reload_early_microcode(void); #else static inline void __init load_ucode_bsp(void) {} static inline void load_ucode_ap(void) {} @@ -85,6 +159,7 @@ static inline int __init save_microcode_in_initrd(void) { return 0; } +static inline void reload_early_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index b7b10b82d3e5..af935397e053 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,7 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); +extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; @@ -68,10 +68,12 @@ extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; extern void __init load_ucode_amd_bsp(void); extern void load_ucode_amd_ap(void); extern int __init save_microcode_in_initrd_amd(void); +void reload_ucode_amd(void); #else static inline void __init load_ucode_amd_bsp(void) {} static inline void load_ucode_amd_ap(void) {} static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +static inline void reload_ucode_amd(void) {} #endif #endif /* _ASM_X86_MICROCODE_AMD_H */ diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index bbe296e0bce1..2b9209c46ca9 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -56,23 +56,28 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int -get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev); +extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc); extern int microcode_sanity_check(void *mc, int print_err); -extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev); -extern int -update_match_revision(struct microcode_header_intel *mc_header, int rev); +extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc); + +static inline int +revision_is_newer(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 
0 : 1; +} #ifdef CONFIG_MICROCODE_INTEL_EARLY extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); extern void show_ucode_info_early(void); extern int __init save_microcode_in_initrd_intel(void); +void reload_ucode_intel(void); #else static inline __init void load_ucode_intel_bsp(void) {} static inline void load_ucode_intel_ap(void) {} static inline void show_ucode_info_early(void) {} static inline int __init save_microcode_in_initrd_intel(void) { return -EINVAL; } +static inline void reload_ucode_intel(void) {} #endif #if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 876e74e8eec7..09b9620a73b4 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,6 +19,8 @@ typedef struct { struct mutex lock; void __user *vdso; + + atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a8e865..883f6b933fa4 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -10,15 +10,29 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> +#include <asm/mpx.h> #ifndef CONFIG_PARAVIRT -#include <asm-generic/mm_hooks.h> - static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { } #endif /* !CONFIG_PARAVIRT */ +#ifdef CONFIG_PERF_EVENTS +extern struct static_key rdpmc_always_available; + +static inline void load_mm_cr4(struct mm_struct *mm) +{ + if (static_key_true(&rdpmc_always_available) || + atomic_read(&mm->context.perf_rdpmc_allowed)) + cr4_set_bits(X86_CR4_PCE); + else + cr4_clear_bits(X86_CR4_PCE); +} +#else +static inline void load_mm_cr4(struct mm_struct *mm) {} +#endif + /* * Used for LDT copy/destruction. */ @@ -53,7 +67,21 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Stop flush ipis for the previous mm */ cpumask_clear_cpu(cpu, mm_cpumask(prev)); - /* Load the LDT, if the LDT is different: */ + /* Load per-mm CR4 state */ + load_mm_cr4(next); + + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never free an LDT while the mm still exists. That + * means that next->context.ldt != prev->context.ldt, + * because mms never share an LDT. 
+ */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } @@ -77,6 +105,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); load_LDT_nolock(&next->context); } } @@ -102,4 +131,45 @@ do { \ } while (0) #endif +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + paravirt_arch_dup_mmap(oldmm, mm); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ + paravirt_arch_exit_mmap(mm); +} + +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + mpx_mm_init(mm); +} + +static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + /* + * mpx_notify_unmap() goes and reads a rarely-hot + * cacheline in the mm_struct. That can be expensive + * enough to be seen in profiles. + * + * The mpx_notify_unmap() call and its contents have been + * observed to affect munmap() performance on hardware + * where MPX is not present. + * + * The unlikely() optimizes for the fast case: no MPX + * in the CPU, or no MPX use in the process. Even if + * we get this wrong (in the unlikely event that MPX + * is widely enabled on some system) the overhead of + * MPX itself (reading bounds tables) is expected to + * overwhelm the overhead of getting this unlikely() + * consistently wrong. + */ + if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX))) + mpx_notify_unmap(mm, vma, start, end); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h new file mode 100644 index 000000000000..a952a13d59a7 --- /dev/null +++ b/arch/x86/include/asm/mpx.h @@ -0,0 +1,103 @@ +#ifndef _ASM_X86_MPX_H +#define _ASM_X86_MPX_H + +#include <linux/types.h> +#include <asm/ptrace.h> +#include <asm/insn.h> + +/* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ +#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) +#define MPX_BNDCFG_ENABLE_FLAG 0x1 +#define MPX_BD_ENTRY_VALID_FLAG 0x1 + +#ifdef CONFIG_X86_64 + +/* upper 28 bits [47:20] of the virtual address in 64-bit used to + * index into bounds directory (BD). + */ +#define MPX_BD_ENTRY_OFFSET 28 +#define MPX_BD_ENTRY_SHIFT 3 +/* bits [19:3] of the virtual address in 64-bit used to index into + * bounds table (BT). 
+ */ +#define MPX_BT_ENTRY_OFFSET 17 +#define MPX_BT_ENTRY_SHIFT 5 +#define MPX_IGN_BITS 3 +#define MPX_BD_ENTRY_TAIL 3 + +#else + +#define MPX_BD_ENTRY_OFFSET 20 +#define MPX_BD_ENTRY_SHIFT 2 +#define MPX_BT_ENTRY_OFFSET 10 +#define MPX_BT_ENTRY_SHIFT 4 +#define MPX_IGN_BITS 2 +#define MPX_BD_ENTRY_TAIL 2 + +#endif + +#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) +#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) + +#define MPX_BNDSTA_TAIL 2 +#define MPX_BNDCFG_TAIL 12 +#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1)) +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1)) + +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1)) +#define MPX_BNDSTA_ERROR_CODE 0x3 + +#define MPX_BD_ENTRY_MASK ((1<<MPX_BD_ENTRY_OFFSET)-1) +#define MPX_BT_ENTRY_MASK ((1<<MPX_BT_ENTRY_OFFSET)-1) +#define MPX_GET_BD_ENTRY_OFFSET(addr) ((((addr)>>(MPX_BT_ENTRY_OFFSET+ \ + MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT) +#define MPX_GET_BT_ENTRY_OFFSET(addr) ((((addr)>>MPX_IGN_BITS) & \ + MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT) + +#ifdef CONFIG_X86_INTEL_MPX +siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf); +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf); +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ + /* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; +} +void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#else +static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, + struct xsave_struct *xsave_buf) +{ + return NULL; +} +static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + return -EINVAL; +} +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return 0; +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ +} +static inline void mpx_notify_unmap(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} +#endif /* CONFIG_X86_INTEL_MPX */ + +#endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a1410db38a1a..653dfa7662e1 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. 
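__sti_mwait() added above pairs STI with MWAIT deliberately: STI takes effect only after the following instruction, so interrupts are re-enabled exactly as MWAIT begins waiting and a wakeup cannot be lost in the window between the last need_resched() check and the wait. A simplified sketch of the idle pattern it serves (not the actual mwait_idle() code; assumes interrupts are disabled on entry):

#include <linux/sched.h>
#include <asm/mwait.h>

static void example_mwait_idle(void)
{
	if (!need_resched()) {
		/* arm the monitor on this task's flags word */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);	/* wait with IRQs re-armed by the STI shadow */
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}
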
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index f48b17df4224..3a52ee0e726d 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -20,7 +20,6 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f408caf73430..b3bebf9e5746 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -39,6 +39,8 @@ void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ -#define __HAVE_ARCH_GATE_AREA 1 +#ifdef CONFIG_X86_VSYSCALL_EMULATION +# define __HAVE_ARCH_GATE_AREA 1 +#endif #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 678205195ae1..4edd53b79a81 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -1,25 +1,30 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_SIZE_ORDER 2 +#ifdef CONFIG_KASAN +#define KASAN_STACK_ORDER 1 +#else +#define KASAN_STACK_ORDER 0 +#endif + +#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) -#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) -#define IRQ_STACK_ORDER 2 +#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e1610e29e..5f6051d5d139 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x) PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); } @@ -330,13 +330,13 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm); } -static inline void arch_exit_mmap(struct mm_struct *mm) +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm); } @@ -976,15 +976,20 @@ extern void default_banner(void); 
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) - -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop +#ifndef __ASSEMBLY__ +static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ +} + +static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) +{ +} +#endif /* __ASSEMBLY__ */ #endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index e2c1668dde7a..91bc4ba95f91 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -11,16 +11,17 @@ static const int pat_enabled; #endif extern void pat_init(void); +void pat_init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, - unsigned long req_type, unsigned long *ret_type); + enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, - unsigned long flag); + enum page_cache_mode pcm); int io_reserve_memtype(resource_size_t start, resource_size_t end, - unsigned long *type); + enum page_cache_mode *pcm); void io_free_memtype(resource_size_t start, resource_size_t end); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 0892ea0e683f..4e370a5d8117 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -96,12 +96,15 @@ extern void pci_iommu_alloc(void); #ifdef CONFIG_PCI_MSI /* implemented in arch/x86/kernel/apic/io_apic. 
*/ struct msi_desc; +void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq, + unsigned int dest, struct msi_msg *msg, u8 hpet_id); int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void native_teardown_msi_irq(unsigned int irq); void native_restore_msi_irqs(struct pci_dev *dev); int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, unsigned int irq_base, unsigned int irq_offset); #else +#define native_compose_msi_msg NULL #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index fa1195dae425..164e3f8d3c3d 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); +extern bool mp_should_keep_irq(struct device *dev); + struct pci_raw_ops { int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val); diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index fd472181a1d0..e0ba66ca68c6 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -64,7 +64,7 @@ #define __percpu_prefix "" #endif -#define __percpu_arg(x) __percpu_prefix "%P" #x +#define __percpu_arg(x) __percpu_prefix "%" #x /* * Initialized pointers to per-cpu variables needed for the boot @@ -179,29 +179,58 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var, constraint) \ +#define percpu_from_op(op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ - : constraint); \ + : "m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pfo_ret__; \ +}) + +#define percpu_stable_op(op, var) \ +({ \ + typeof(var) pfo_ret__; \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_arg(P1)",%0" \ + : "=q" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 2: \ + asm(op "w "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 4: \ + asm(op "l "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ + break; \ + case 8: \ + asm(op "q "__percpu_arg(P1)",%0" \ + : "=r" (pfo_ret__) \ + : "p" (&(var))); \ break; \ default: __bad_percpu_size(); \ } \ @@ -359,11 +388,11 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. 
*/ -#define this_cpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var))) +#define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) @@ -381,9 +410,9 @@ do { \ #define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) -#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) @@ -435,7 +464,7 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -444,7 +473,7 @@ do { \ #define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) +#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) #define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) #define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) #define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) @@ -522,7 +551,7 @@ static inline int x86_this_cpu_variable_test_bit(int nr, #include <asm-generic/percpu.h> /* We can use this directly for local CPU (faster). 
*/ -DECLARE_PER_CPU(unsigned long, this_cpu_off); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8dfc9fd094a3..dc0f6ed35b08 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -177,6 +177,9 @@ struct x86_pmu_capability { #define IBS_CAPS_BRNTRGT (1U<<5) #define IBS_CAPS_OPCNTEXT (1U<<6) #define IBS_CAPS_RIPINVALIDCHK (1U<<7) +#define IBS_CAPS_OPBRNFUSE (1U<<8) +#define IBS_CAPS_FETCHCTLEXTD (1U<<9) +#define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 206a87fdd22d..fd74a11959de 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, - * split up the 29 bits of offset into this range. - */ -#define PTE_FILE_MAX_BITS 29 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) + - _PAGE_FILE, - }; -} - /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 81bb91b49a88..cdaa58c9b39e 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif -/* - * Bits 0, 6 and 7 are taken in the low part of the pte, - * put the 32 bits of offset into the high part. - * - * For soft-dirty tracking 11 bit is taken from - * the low part of pte as well. 
- */ -#define pte_to_pgoff(pte) ((pte).pte_high) -#define pgoff_to_pte(off) \ - ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) -#define PTE_FILE_MAX_BITS 32 - /* Encode and de-code a swap entry */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) #define __swp_type(x) (((x).val) & 0x1f) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index aa97a070f09f..a0c35bf6cb92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -9,9 +9,10 @@ /* * Macro to mark a page protection value as UC- */ -#define pgprot_noncached(prot) \ - ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | \ + cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ : (prot)) #ifndef __ASSEMBLY__ @@ -99,6 +100,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_dirty(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_DIRTY; +} + static inline int pmd_young(pmd_t pmd) { return pmd_flags(pmd) & _PAGE_ACCESSED; @@ -109,11 +115,6 @@ static inline int pte_write(pte_t pte) return pte_flags(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) -{ - return pte_flags(pte) & _PAGE_FILE; -} - static inline int pte_huge(pte_t pte) { return pte_flags(pte) & _PAGE_PSE; @@ -131,13 +132,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - /* - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. - */ - return (pte_flags(pte) & _PAGE_SPECIAL) && - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) @@ -299,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -323,21 +318,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_file_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_file_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SOFT_DIRTY); -} - -static inline int pte_file_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SOFT_DIRTY; -} - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ /* @@ -404,8 +384,8 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, - unsigned long flags, - unsigned long new_flags) + enum page_cache_mode pcm, + enum page_cache_mode new_pcm) { /* * PAT type is always WB for untracked ranges, so no need to check. 
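The is_new_memtype_allowed() conversion that begins here (and continues in the next hunk) swaps raw _PAGE_CACHE_* flag values for the new enum page_cache_mode. The rule itself is unchanged: when an existing mapping forces the request to be downgraded, a UC- or WC request must never come back as WB. A stand-alone restatement of that check, mirroring the enum introduced later in this series for illustration:

#include <stdbool.h>

/* mirror of the kernel's enum page_cache_mode, for illustration only */
enum page_cache_mode {
	_PAGE_CACHE_MODE_WB = 0,
	_PAGE_CACHE_MODE_WC = 1,
	_PAGE_CACHE_MODE_UC_MINUS = 2,
	_PAGE_CACHE_MODE_UC = 3,
	_PAGE_CACHE_MODE_WT = 4,
	_PAGE_CACHE_MODE_WP = 5,
};

static bool new_memtype_allowed(enum page_cache_mode pcm,
				enum page_cache_mode new_pcm)
{
	/* a WB result would weaken what UC- or WC asked for */
	if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && new_pcm == _PAGE_CACHE_MODE_WB) ||
	    (pcm == _PAGE_CACHE_MODE_WC && new_pcm == _PAGE_CACHE_MODE_WB))
		return false;
	return true;
}
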
@@ -419,10 +399,10 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, * - request is uncached, return cannot be write-back * - request is write-combine, return cannot be write-back */ - if ((flags == _PAGE_CACHE_UC_MINUS && - new_flags == _PAGE_CACHE_WB) || - (flags == _PAGE_CACHE_WC && - new_flags == _PAGE_CACHE_WB)) { + if ((pcm == _PAGE_CACHE_MODE_UC_MINUS && + new_pcm == _PAGE_CACHE_MODE_WB) || + (pcm == _PAGE_CACHE_MODE_WC && + new_pcm == _PAGE_CACHE_MODE_WB)) { return 0; } @@ -457,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) -{ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -473,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -493,10 +466,27 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT)) + == _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be @@ -553,11 +543,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -876,19 +861,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index ed5903be26fe..9fb2f2bc8245 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -37,7 +37,7 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ +#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ & PMD_MASK) #ifdef CONFIG_HIGHMEM diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4572b2f30237..2ee781114d34 100644 --- 
a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* PUD - Level3 access */ /* PMD - Level 2 access */ -#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ - _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT /* PTE - Level 1 access. */ @@ -145,13 +141,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else +#define SWP_TYPE_BITS 5 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 7166e25ecb57..602b6028c5b6 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -63,6 +63,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 07789647bf33..8c7c10802e9c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -4,7 +4,7 @@ #include <linux/const.h> #include <asm/page_types.h> -#define FIRST_USER_ADDRESS 0 +#define FIRST_USER_ADDRESS 0UL #define _PAGE_BIT_PRESENT 0 /* is present */ #define _PAGE_BIT_RW 1 /* writeable */ @@ -27,19 +27,9 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL -/* - set: nonlinear file mapping, saved PTE; unset:swap */ -#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) @@ -78,21 +68,6 @@ #endif /* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - -/* * Tracking soft dirty bit when a page goes to a swap is tricky. 
* We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 6 and 7 are *not* involved @@ -114,7 +89,6 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -125,14 +99,31 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + +/* + * The cache modes defined here are used to translate between pure SW usage + * and the HW defined cache mode bits and/or PAT entries. + * + * The resulting bits for PWT, PCD and PAT should be chosen in a way + * to have the WB mode at index 0 (all bits clear). This is the default + * right now and likely would break too much if changed. + */ +#ifndef __ASSEMBLY__ +enum page_cache_mode { + _PAGE_CACHE_MODE_WB = 0, + _PAGE_CACHE_MODE_WC = 1, + _PAGE_CACHE_MODE_UC_MINUS = 2, + _PAGE_CACHE_MODE_UC = 3, + _PAGE_CACHE_MODE_WT = 4, + _PAGE_CACHE_MODE_WP = 5, + _PAGE_CACHE_MODE_NUM = 8 +}; +#endif -#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) -#define _PAGE_CACHE_WB (0) -#define _PAGE_CACHE_WC (_PAGE_PWT) -#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) -#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) +#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) +#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ @@ -156,41 +147,27 @@ #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) -#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) -#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) -#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) #define __PAGE_KERNEL_IO (__PAGE_KERNEL) #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) -#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS) -#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC) #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) -#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) -#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL 
__pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) -#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) -#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) -#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) /* xwr */ #define __P000 PAGE_NONE @@ -324,22 +301,61 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; +extern uint8_t __pte2cachemode_tbl[8]; + +#define __pte2cm_idx(cb) \ + ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ + (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ + (((cb) >> _PAGE_BIT_PWT) & 1)) +#define __cm_idx2pte(i) \ + ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ + (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ + (((i) & 1) << _PAGE_BIT_PWT)) + +static inline unsigned long cachemode2protval(enum page_cache_mode pcm) { - return pte_flags(pte) & _PAGE_NUMA_MASK; + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; } - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { - return pmd_flags(pmd) & _PAGE_NUMA_MASK; + return __pgprot(cachemode2protval(pcm)); } -#endif /* CONFIG_NUMA_BALANCING */ +static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; -#define pgprot_val(x) ((x).pgprot) -#define __pgprot(x) ((pgprot_t) { (x) } ) + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} +static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} +static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) +{ + pgprot_t new; + unsigned long val; + + val = pgprot_val(pgprot); + pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT_LARGE) >> + (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); + return new; +} typedef struct page *pgtable_t; @@ -396,6 +412,7 @@ static inline void update_page_count(int level, unsigned long pages) { } extern pte_t *lookup_address(unsigned long address, unsigned int *level); extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level); +extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h index 0a4e140315b6..7249e6d0902d 100644 --- a/arch/x86/include/asm/platform_sst_audio.h +++ b/arch/x86/include/asm/platform_sst_audio.h @@ -16,6 +16,9 @@ #include <linux/sfi.h> +#define MAX_NUM_STREAMS_MRFLD 25 +#define MAX_NUM_STREAMS MAX_NUM_STREAMS_MRFLD + enum sst_audio_task_id_mrfld { SST_TASK_ID_NONE = 0, SST_TASK_ID_SBA = 1, @@ -73,6 +76,65 @@ struct sst_platform_data { 
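[Annotation: the __pte2cm_idx() packing in the pgtable_types.h hunk above compresses the three scattered PTE cache bits into a dense 0..7 table index. A standalone check using the architectural 4K-PTE bit positions (PWT bit 3, PCD bit 4, PAT bit 7):]

#include <stdio.h>

#define _PAGE_BIT_PWT 3
#define _PAGE_BIT_PCD 4
#define _PAGE_BIT_PAT 7

#define __pte2cm_idx(cb)                                \
        ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) |          \
         (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) |          \
         (((cb) >> _PAGE_BIT_PWT)       & 1))

int main(void)
{
        unsigned long wb       = 0;                        /* no bits set  */
        unsigned long uc_minus = 1UL << _PAGE_BIT_PCD;     /* PCD only     */
        unsigned long uc       = (1UL << _PAGE_BIT_PCD) | (1UL << _PAGE_BIT_PWT);

        printf("%d %d %d\n",
               __pte2cm_idx(wb),          /* 0: WB stays at index 0 */
               __pte2cm_idx(uc_minus),    /* 2                      */
               __pte2cm_idx(uc));         /* 3                      */
        return 0;
}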
unsigned int strm_map_size; }; +struct sst_info { + u32 iram_start; + u32 iram_end; + bool iram_use; + u32 dram_start; + u32 dram_end; + bool dram_use; + u32 imr_start; + u32 imr_end; + bool imr_use; + u32 mailbox_start; + bool use_elf; + bool lpe_viewpt_rqd; + unsigned int max_streams; + u32 dma_max_len; + u8 num_probes; +}; + +struct sst_lib_dnld_info { + unsigned int mod_base; + unsigned int mod_end; + unsigned int mod_table_offset; + unsigned int mod_table_size; + bool mod_ddr_dnld; +}; + +struct sst_res_info { + unsigned int shim_offset; + unsigned int shim_size; + unsigned int shim_phy_addr; + unsigned int ssp0_offset; + unsigned int ssp0_size; + unsigned int dma0_offset; + unsigned int dma0_size; + unsigned int dma1_offset; + unsigned int dma1_size; + unsigned int iram_offset; + unsigned int iram_size; + unsigned int dram_offset; + unsigned int dram_size; + unsigned int mbox_offset; + unsigned int mbox_size; + unsigned int acpi_lpe_res_index; + unsigned int acpi_ddr_index; + unsigned int acpi_ipc_irq_index; +}; + +struct sst_ipc_info { + int ipc_offset; + unsigned int mbox_recv_off; +}; + +struct sst_platform_info { + const struct sst_info *probe_data; + const struct sst_ipc_info *ipc_info; + const struct sst_res_info *res_info; + const struct sst_lib_dnld_info *lib_info; + const char *platform; +}; int add_sst_platform_device(void); #endif diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h index fc7a17c05d35..bc0fc0866553 100644 --- a/arch/x86/include/asm/pmc_atom.h +++ b/arch/x86/include/asm/pmc_atom.h @@ -53,6 +53,28 @@ /* Sleep state counter is in units of of 32us */ #define PMC_TMR_SHIFT 5 +/* Power status of power islands */ +#define PMC_PSS 0x98 + +#define PMC_PSS_BIT_GBE BIT(0) +#define PMC_PSS_BIT_SATA BIT(1) +#define PMC_PSS_BIT_HDA BIT(2) +#define PMC_PSS_BIT_SEC BIT(3) +#define PMC_PSS_BIT_PCIE BIT(4) +#define PMC_PSS_BIT_LPSS BIT(5) +#define PMC_PSS_BIT_LPE BIT(6) +#define PMC_PSS_BIT_DFX BIT(7) +#define PMC_PSS_BIT_USH_CTRL BIT(8) +#define PMC_PSS_BIT_USH_SUS BIT(9) +#define PMC_PSS_BIT_USH_VCCS BIT(10) +#define PMC_PSS_BIT_USH_VCCA BIT(11) +#define PMC_PSS_BIT_OTG_CTRL BIT(12) +#define PMC_PSS_BIT_OTG_VCCS BIT(13) +#define PMC_PSS_BIT_OTG_VCCA_CLK BIT(14) +#define PMC_PSS_BIT_OTG_VCCA BIT(15) +#define PMC_PSS_BIT_USB BIT(16) +#define PMC_PSS_BIT_USB_SUS BIT(17) + /* These registers reflect D3 status of functions */ #define PMC_D3_STS_0 0xA0 diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 400873450e33..8f3271842533 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -30,9 +30,6 @@ static __always_inline void preempt_count_set(int pc) /* * must be macros to avoid header recursion hell */ -#define task_preempt_count(p) \ - (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED) - #define init_task_preempt_count(p) do { \ task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \ } while (0) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eb71ec794732..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -127,7 +127,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); +}; #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 @@ -151,7 +151,7 @@ extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS]; #ifdef CONFIG_SMP -DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, 
cpu_info); +DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #else #define cpu_info boot_cpu_data @@ -210,8 +210,23 @@ struct x86_hw_tss { unsigned long sp0; unsigned short ss0, __ss0h; unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so ss1 is a convenient scratch space in + * the same cacheline as sp0. We use ss1 to cache the value in + * MSR_IA32_SYSENTER_CS. When we context switch + * MSR_IA32_SYSENTER_CS, we first check if the new value being + * written matches ss1, and, if it's not, then we wrmsr the new + * value and update ss1. + * + * The only reason we context switch MSR_IA32_SYSENTER_CS is + * that we set it to zero in vm86 tasks to avoid corrupting the + * stack if we were to go through the sysenter path from vm86 + * mode. + */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; @@ -276,13 +291,17 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); + +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif /* * Save the original ist values for checking stack pointers during debugging @@ -374,13 +393,14 @@ struct lwp_struct { u8 reserved[128]; }; -struct bndregs_struct { - u64 bndregs[8]; +struct bndreg { + u64 lower_bound; + u64 upper_bound; } __packed; -struct bndcsr_struct { - u64 cfg_reg_u; - u64 status_reg; +struct bndcsr { + u64 bndcfgu; + u64 bndstatus; } __packed; struct xsave_hdr_struct { @@ -394,8 +414,8 @@ struct xsave_struct { struct xsave_hdr_struct xsave_hdr; struct ymmh_struct ymmh; struct lwp_struct lwp; - struct bndregs_struct bndregs; - struct bndcsr_struct bndcsr; + struct bndreg bndreg[4]; + struct bndcsr bndcsr; /* new processor state extensions will go here */ } __attribute__ ((packed, aligned (64))); @@ -473,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; @@ -563,6 +582,16 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long current_top_of_stack(void) +{ +#ifdef CONFIG_X86_64 + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. */ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -578,39 +607,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
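[Annotation: the ss1 trick described in the x86_hw_tss comment is worth a sketch: treat the field as a software cache so the slow wrmsr only happens when the value actually changes. A hypothetical helper, assuming kernel context; the function name is illustrative, not from this patch:]

static void set_sysenter_cs(struct tss_struct *tss, unsigned short cs)
{
        if (tss->x86_tss.ss1 == cs)
                return;                 /* cached copy matches: skip the wrmsr */

        tss->x86_tss.ss1 = cs;
        wrmsr(MSR_IA32_SYSENTER_CS, cs, 0);
}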
- */ -extern unsigned long mmu_cr4_features; -extern u32 *trampoline_cr4_features; - -static inline void set_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 |= mask; - write_cr4(cr4); -} - -static inline void clear_in_cr4(unsigned long mask) -{ - unsigned long cr4; - - mmu_cr4_features &= ~mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4 = read_cr4(); - cr4 &= ~mask; - write_cr4(cr4); -} - typedef struct { unsigned long seg; } mm_segment_t; @@ -793,10 +789,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -807,10 +803,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -820,10 +815,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) @@ -831,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -841,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -882,18 +856,24 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. 
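[Annotation: the interesting part of the prefetch change below is the constraint: moving from "r" (x) with an (%1) dereference to an "m" operand lets the compiler fold whatever addressing mode it already has, instead of burning a register on the address. The idiom in isolation, a sketch without the alternatives patching:]

static inline void prefetch_nta(const void *x)
{
        /* memory operand: gcc picks the addressing mode, no scratch register */
        asm volatile("prefetchnta %P0" : : "m" (*(const char *)x));
}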
*/ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else /* - * User space process size. 47bits minus one guard page. + * User space process size. 47bits minus one guard page. The guard + * page is necessary on Intel CPUs: if a SYSCALL instruction is at + * the highest possible canonical userspace address, then that + * syscall will enter the kernel with a non-canonical return + * address, and SYSRET will explode dangerously. We avoid this + * particular problem by preventing anything from being mapped + * at the maximum canonical address. */ #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) @@ -912,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* @@ -928,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, @@ -953,6 +924,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +/* Register/unregister a process' MPX related resource */ +#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) +#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) + +#ifdef CONFIG_X86_INTEL_MPX +extern int mpx_enable_management(struct task_struct *tsk); +extern int mpx_disable_management(struct task_struct *tsk); +#else +static inline int mpx_enable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +static inline int mpx_disable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +#endif /* CONFIG_X86_INTEL_MPX */ + extern u16 amd_get_nb_id(int cpu); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -31,13 +31,17 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -47,9 +51,12 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. 
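[Annotation: the new TASK_SIZE_MAX comment deserves the arithmetic spelled out. A standalone check of the guard-page reasoning; PAGE_SIZE of 4096 is the usual x86 value:]

#include <stdio.h>

int main(void)
{
        unsigned long long boundary      = 1ULL << 47;       /* canonical limit  */
        unsigned long long task_size_max = boundary - 4096;  /* minus guard page */

        printf("TASK_SIZE_MAX      = %#llx\n", task_size_max);
        printf("canonical boundary = %#llx\n", boundary);
        /*
         * The highest mappable byte is task_size_max - 1, so even a SYSCALL
         * instruction ending exactly there returns to an address strictly
         * below 2^47: still canonical, so SYSRET cannot fault on it.
         */
        return 0;
}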
+ * On hw interrupt, it's IRQ number: + */ unsigned long orig_ax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; @@ -89,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) } /* - * user_mode_vm(regs) determines whether a register set came from user mode. - * This is true if V8086 mode was enabled OR if the register set was from - * protected mode with RPL-3 CS value. This tricky test checks that with - * one comparison. Many places in the kernel can bypass this full check - * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + * user_mode(regs) determines whether a register set came from user + * mode. On x86_32, this is true if V8086 mode was enabled OR if the + * register set was from protected mode with RPL-3 CS value. This + * tricky test checks that with one comparison. + * + * On x86_64, vm86 mode is mercifully nonexistent, and we don't need + * the extra check. */ static inline int user_mode(struct pt_regs *regs) { @@ -104,16 +113,6 @@ static inline int user_mode(struct pt_regs *regs) #endif } -static inline int user_mode_vm(struct pt_regs *regs) -{ -#ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= - USER_RPL; -#else - return user_mode(regs); -#endif -} - static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 @@ -138,12 +137,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #endif } -#define current_user_stack_pointer() this_cpu_read(old_rsp) -/* ia32 vs. x32 difference */ -#define compat_user_stack_pointer() \ - (test_thread_flag(TIF_IA32) \ - ? current_pt_regs()->sp \ - : this_cpu_read(old_rsp)) +#define current_user_stack_pointer() current_pt_regs()->sp +#define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 @@ -248,7 +243,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, */ #define arch_ptrace_stop_needed(code, info) \ ({ \ - set_thread_flag(TIF_NOTIFY_RESUME); \ + force_iret(); \ false; \ }) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..25b1cc07d496 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; + u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 6f1c3a8a33ab..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -3,8 +3,10 @@ #include <linux/const.h> -/* Constructor for a conventional segment GDT (or LDT) entry */ -/* This is a macro so it can be used in initializers */ +/* + * Constructor for a conventional segment GDT (or LDT) entry. + * This is a macro so it can be used in initializers. 
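[Annotation: the one-comparison user_mode() test above is compact enough to be opaque; a standalone demonstration of the three cases it folds together (selector RPL values and the EFLAGS.VM bit are architectural):]

#include <stdio.h>

#define SEGMENT_RPL_MASK 0x3
#define X86_VM_MASK      0x00020000   /* EFLAGS.VM, bit 17 */
#define USER_RPL         0x3

static int user_mode_32(unsigned long cs, unsigned long flags)
{
        return ((cs & SEGMENT_RPL_MASK) | (flags & X86_VM_MASK)) >= USER_RPL;
}

int main(void)
{
        printf("%d\n", user_mode_32(0x73, 0));           /* user CS, RPL 3 -> 1  */
        printf("%d\n", user_mode_32(0x10, 0));           /* kernel CS, RPL 0 -> 0 */
        printf("%d\n", user_mode_32(0x10, X86_VM_MASK)); /* vm86: VM bit -> 1    */
        return 0;
}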
+ */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ @@ -12,210 +14,228 @@ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) -/* Simple and small GDT entries for booting only */ +/* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) +#define GDT_ENTRY_BOOT_DS 3 +#define GDT_ENTRY_BOOT_TSS 4 +#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) +#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) +/* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_RPL_MASK 0x3 -#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) -#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +/* User mode is privilege level 3: */ +#define USER_RPL 0x3 + +/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ +#define SEGMENT_TI_MASK 0x4 +/* LDT segment has TI set ... */ +#define SEGMENT_LDT 0x4 +/* ... GDT has it cleared */ +#define SEGMENT_GDT 0x0 + +#define GDT_ENTRY_INVALID_SEG 0 #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: * - * 0 - null + * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * - * 4 - unused <==== new cacheline + * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] - * 8 - TLS segment #3 + * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * - * 12 - kernel code segment <==== new cacheline + * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS - * 16 - TSS + * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support - * 20 - PNPBIOS support + * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support - * 24 - APM BIOS support + * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - stack_canary-20 [ for stack protector ] + * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ -#define GDT_ENTRY_TLS_MIN 6 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_KERNEL_CS 12 +#define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 - #define GDT_ENTRY_DEFAULT_USER_DS 15 +#define GDT_ENTRY_TSS 16 +#define GDT_ENTRY_LDT 17 +#define GDT_ENTRY_PNPBIOS_CS32 18 +#define GDT_ENTRY_PNPBIOS_CS16 19 +#define GDT_ENTRY_PNPBIOS_DS 20 +#define GDT_ENTRY_PNPBIOS_TS1 21 +#define GDT_ENTRY_PNPBIOS_TS2 22 +#define GDT_ENTRY_APMBIOS_BASE 23 + +#define GDT_ENTRY_ESPFIX_SS 26 +#define GDT_ENTRY_PERCPU 27 +#define GDT_ENTRY_STACK_CANARY 28 + +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 -#define GDT_ENTRY_KERNEL_BASE (12) +/* + * Number of entries in the GDT table: + */ +#define GDT_ENTRIES 32 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) +/* + * Segment selector values corresponding to the above entries: + */ -#define GDT_ENTRY_KERNEL_DS 
(GDT_ENTRY_KERNEL_BASE+1) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) +/* segment for calling fn: */ +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) +/* code segment for BIOS: */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) +/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) +/* data segment for BIOS: */ +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) +/* transfer data segment: */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) +/* another data segment: */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) #ifdef CONFIG_SMP -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else -#define __KERNEL_PERCPU 0 +# define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) +# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else -#define __KERNEL_STACK_CANARY 0 +# define __KERNEL_STACK_CANARY 0 #endif -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 +#else /* 64-bit: */ -/* - * The GDT has 32 entries - */ -#define GDT_ENTRIES 32 - -/* The PnP BIOS entries in the GDT */ -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) - -/* The PnP BIOS selectors */ -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ - -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 +#include <asm/cache.h> -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 /* - * Matching rules for certain types of segments. + * We cannot use the same code segment descriptor for user and kernel mode, + * not even in long flat mode, because of different DPL. + * + * GDT layout to get 64-bit SYSCALL/SYSRET support right. 
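[Annotation: the SYSRET constraint described in this comment is pure selector arithmetic (index * 8, plus RPL 3 for user selectors). A standalone check that the chosen ordering lines up with what SYSRET hardcodes:]

#include <stdio.h>

int main(void)
{
        unsigned user32_cs = 4 * 8 + 3;   /* 0x23 == STAR.SYSRET_CS      */
        unsigned user_ds   = 5 * 8 + 3;   /* 0x2b == STAR.SYSRET_CS + 8  */
        unsigned user_cs   = 6 * 8 + 3;   /* 0x33 == STAR.SYSRET_CS + 16 */

        printf("__USER32_CS=%#x __USER_DS=%#x __USER_CS=%#x\n",
               user32_cs, user_ds, user_cs);
        return 0;
}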
SYSRET hardcodes + * selectors: + * + * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, + * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * + * ss = STAR.SYSRET_CS+8 (in either case) + * + * thus USER_DS should be between 32-bit and 64-bit code selectors: */ +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) +/* Needs two entries */ +#define GDT_ENTRY_TSS 8 +/* Needs two entries */ +#define GDT_ENTRY_LDT 10 +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 -#else -#include <asm/cache.h> - -#define GDT_ENTRY_KERNEL32_CS 1 -#define GDT_ENTRY_KERNEL_CS 2 -#define GDT_ENTRY_KERNEL_DS 3 - -#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) +/* Abused to load per CPU data from limit */ +#define GDT_ENTRY_PER_CPU 15 /* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. -AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + * Number of entries in the GDT table: */ -#define GDT_ENTRY_DEFAULT_USER32_CS 4 -#define GDT_ENTRY_DEFAULT_USER_DS 5 -#define GDT_ENTRY_DEFAULT_USER_CS 6 -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) -#define __USER32_DS __USER_DS - -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 - -#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) +#define GDT_ENTRIES 16 -/* TLS indexes for 64bit - hardcoded in arch_prctl */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - -#define GDT_ENTRIES 16 +/* + * Segment selector values corresponding to the above entries: + * + * Note, selectors also need to have a correct RPL, + * expressed with the +3 value for user-space selectors: + */ +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER32_DS __USER_DS +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) + +/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ +#define FS_TLS 0 +#define GS_TLS 1 + +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT -#define get_kernel_rpl() 0 +# define get_kernel_rpl() 0 #endif -/* User mode is privilege level 3 */ -#define USER_RPL 0x3 -/* LDT segment has TI set, GDT has it cleared */ -#define SEGMENT_LDT 0x4 -#define SEGMENT_GDT 0x0 +#define IDT_ENTRIES 256 +#define NUM_EXCEPTION_VECTORS 32 -/* Bottom two bits of selector give the ring privilege level */ -#define SEGMENT_RPL_MASK 0x3 -/* Bit 2 is table indicator (LDT/GDT) */ -#define SEGMENT_TI_MASK 0x4 +/* Bitmask of exception vectors which push an error code on the stack: */ +#define 
EXCEPTION_ERRCODE_MASK 0x00027d00 -#define IDT_ENTRIES 256 -#define NUM_EXCEPTION_VECTORS 32 -/* Bitmask of exception vectors which push an error code on the stack */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 -#define GDT_SIZE (GDT_ENTRIES * 8) -#define GDT_ENTRY_TLS_ENTRIES 3 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) +#define GDT_SIZE (GDT_ENTRIES*8) +#define GDT_ENTRY_TLS_ENTRIES 3 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ + extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; #ifdef CONFIG_TRACING -#define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handlers early_idt_handlers #endif /* @@ -240,37 +260,30 @@ do { \ } while (0) /* - * Save a segment register away + * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") /* - * x86_32 user gs accessors. + * x86-32 user GS accessors: */ #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ +# ifdef CONFIG_X86_32_LAZY_GS +# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) +# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +# define task_user_gs(tsk) ((tsk)->thread.gs) +# define lazy_save_gs(v) savesegment(gs, (v)) +# define lazy_load_gs(v) loadsegment(gs, (v)) +# else /* X86_32_LAZY_GS */ +# define get_user_gs(regs) (u16)((regs)->gs) +# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +# define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +# define lazy_save_gs(v) do { } while (0) +# define lazy_load_gs(v) do { } while (0) +# endif /* X86_32_LAZY_GS */ #endif /* X86_32 */ -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } */ extern struct boot_params boot_params; +static inline bool kaslr_enabled(void) +{ + return !!(boot_params.hdr.loadflags & KASLR_FLAG); +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; + unsigned short __pad2; /* Was called gs, but was always zero. */ + unsigned short __pad1; /* Was called fs, but was always zero. 
*/ + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,9 +13,7 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection - -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..81d02fc7dafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); +void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h deleted file mode 100644 index 0da7409f0bec..000000000000 --- a/arch/x86/include/asm/smpboot_hooks.h +++ /dev/null @@ -1,68 +0,0 @@ -/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws - * which needs to alter them. 
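[Annotation: the smap.h conversion below replaces hand-rolled altinstruction blocks with the new ALTERNATIVE macro. A hypothetical sketch of the bracket the patched clac()/stac() provide; real code goes through the uaccess machinery with exception handling, this only illustrates the pairing, and both calls patch to nothing on non-SMAP hardware:]

static unsigned long peek_user_word(const unsigned long __user *p)
{
        unsigned long v;

        stac();                                 /* open supervisor access to user pages */
        v = *(const unsigned long __force *)p;  /* raw dereference, sketch only */
        clac();                                 /* and close it again */
        return v;
}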
*/ - -static inline void smpboot_clear_io_apic_irqs(void) -{ -#ifdef CONFIG_X86_IO_APIC - io_apic_irqs = 0; -#endif -} - -static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) -{ - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0xa, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = - start_eip >> 4; - pr_debug("2.\n"); - *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = - start_eip & 0xf; - pr_debug("3.\n"); -} - -static inline void smpboot_restore_warm_reset_vector(void) -{ - unsigned long flags; - - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - spin_lock_irqsave(&rtc_lock, flags); - CMOS_WRITE(0, 0xf); - spin_unlock_irqrestore(&rtc_lock, flags); - - *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; -} - -static inline void __init smpboot_setup_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else { - nr_ioapics = 0; - } -#endif -} - -static inline void smpboot_clear_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - nr_ioapics = 0; -#endif -} diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index e820c080a4e9..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include <asm/nops.h> + static inline void native_clts(void) { asm volatile("clts"); @@ -137,17 +139,17 @@ static inline void write_cr3(unsigned long x) native_write_cr3(x); } -static inline unsigned long read_cr4(void) +static inline unsigned long __read_cr4(void) { return native_read_cr4(); } -static inline unsigned long read_cr4_safe(void) +static inline unsigned long __read_cr4_safe(void) { return native_read_cr4_safe(); } -static inline void write_cr4(unsigned long x) +static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } @@ -199,6 +201,28 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void clwb(volatile void *__p) +{ + volatile struct { char x[64]; } *p = __p; + + asm volatile(ALTERNATIVE_2( + ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", + ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ + X86_FEATURE_CLFLUSHOPT, + ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ + X86_FEATURE_CLWB) + : [p] "+m" (*p) + : [pax] "a" (p)); +} + +static inline void pcommit_sfence(void) +{ + alternative(ASM_NOP7, + ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ + "sfence", + X86_FEATURE_PCOMMIT); +} + #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 9295016485c9..cf87de3fc390 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key); static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) { - set_bit(0, (volatile unsigned long *)&lock->tickets.tail); + set_bit(0, (volatile unsigned long *)&lock->tickets.head); } #else /* !CONFIG_PARAVIRT_SPINLOCKS */ @@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock, } #endif /* 
CONFIG_PARAVIRT_SPINLOCKS */ +static inline int __tickets_equal(__ticket_t one, __ticket_t two) +{ + return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); +} + +static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, + __ticket_t head) +{ + if (head & TICKET_SLOWPATH_FLAG) { + arch_spinlock_t old, new; + + old.tickets.head = head; + new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; + old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; + new.tickets.tail = old.tickets.tail; + + /* try to clear slowpath flag when there are no contenders */ + cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); + } +} static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { - return lock.tickets.head == lock.tickets.tail; + return __tickets_equal(lock.tickets.head, lock.tickets.tail); } /* @@ -87,91 +107,69 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock) if (likely(inc.head == inc.tail)) goto out; - inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { unsigned count = SPIN_THRESHOLD; do { - if (ACCESS_ONCE(lock->tickets.head) == inc.tail) - goto out; + inc.head = READ_ONCE(lock->tickets.head); + if (__tickets_equal(inc.head, inc.tail)) + goto clear_slowpath; cpu_relax(); } while (--count); __ticket_lock_spinning(lock, inc.tail); } -out: barrier(); /* make sure nothing creeps before the lock is taken */ +clear_slowpath: + __ticket_check_and_clear_slowpath(lock, inc.head); +out: + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; - old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) + old.tickets = READ_ONCE(lock->tickets); + if (!__tickets_equal(old.tickets.head, old.tickets.tail)) return 0; new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); + new.head_tail &= ~TICKET_SLOWPATH_FLAG; /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, - arch_spinlock_t old) -{ - arch_spinlock_t new; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - /* Perform the unlock on the "before" copy */ - old.tickets.head += TICKET_LOCK_INC; - - /* Clear the slowpath flag */ - new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); - - /* - * If the lock is uncontended, clear the flag - use cmpxchg in - * case it changes behind our back though. - */ - if (new.tickets.head != new.tickets.tail || - cmpxchg(&lock->head_tail, old.head_tail, - new.head_tail) != old.head_tail) { - /* - * Lock still has someone queued for it, so wake up an - * appropriate waiter. 
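[Annotation: __tickets_equal() is the workhorse of this spinlock rework: it compares two tickets while masking out the slowpath flag. A standalone demonstration, assuming the paravirt configuration where TICKET_SLOWPATH_FLAG is the low bit:]

#include <stdio.h>

typedef unsigned short __ticket_t;
#define TICKET_SLOWPATH_FLAG ((__ticket_t)1)   /* assumed paravirt value */

static int __tickets_equal(__ticket_t one, __ticket_t two)
{
        return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
}

int main(void)
{
        printf("%d\n", __tickets_equal(4, 4));  /* 1: identical             */
        printf("%d\n", __tickets_equal(4, 5));  /* 1: differ only in flag   */
        printf("%d\n", __tickets_equal(4, 6));  /* 0: genuinely different   */
        return 0;
}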
- */ - __ticket_unlock_kick(lock, old.tickets.head); - } -} - static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - arch_spinlock_t prev; + static_key_false(¶virt_ticketlocks_enabled)) { + __ticket_t head; - prev = *lock; - add_smp(&lock->tickets.head, TICKET_LOCK_INC); + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - /* add_smp() is a full mb() */ + head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) - __ticket_unlock_slowpath(lock, prev); + if (unlikely(head & TICKET_SLOWPATH_FLAG)) { + head &= ~TICKET_SLOWPATH_FLAG; + __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); + } } else __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return tmp.tail != tmp.head; + return !__tickets_equal(tmp.tail, tmp.head); } static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + struct __raw_tickets tmp = READ_ONCE(lock->tickets); - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; + tmp.head &= ~TICKET_SLOWPATH_FLAG; + return (tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended @@ -183,8 +181,20 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { - while (arch_spin_is_locked(lock)) + __ticket_t head = READ_ONCE(lock->tickets.head); + + for (;;) { + struct __raw_tickets tmp = READ_ONCE(lock->tickets); + /* + * We need to check "unlocked" in a loop, tmp.head == head + * can be false positive because of overflow. + */ + if (__tickets_equal(tmp.head, tmp.tail) || + !__tickets_equal(tmp.head, head)) + break; + cpu_relax(); + } } /* diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c468fc2c..e4661196994e 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t function. */ #define __HAVE_ARCH_MEMCPY 1 +extern void *__memcpy(void *to, const void *from, size_t len); + #ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 extern void *memcpy(void *to, const void *from, size_t len); #else -extern void *__memcpy(void *to, const void *from, size_t len); #define memcpy(dst, src, len) \ ({ \ size_t __len = (len); \ @@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET void *memset(void *s, int c, size_t n); +void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMMOVE void *memmove(void *dest, const void *src, size_t count); +void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); size_t strlen(const char *s); @@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) + +/* + * For files that not instrumented (e.g. mm/slub.c) we + * should use not instrumented version of mem* functions. 
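[Annotation: the reworked arch_spin_is_contended() above now strips the slowpath flag from head before the ticket-distance test. A standalone sketch of the distance arithmetic, assuming the paravirt TICKET_LOCK_INC of 2:]

#include <stdio.h>

typedef unsigned short __ticket_t;
#define TICKET_LOCK_INC      2   /* assumed paravirt config */
#define TICKET_SLOWPATH_FLAG 1

static int is_contended(__ticket_t head, __ticket_t tail)
{
        head &= ~TICKET_SLOWPATH_FLAG;
        return (__ticket_t)(tail - head) > TICKET_LOCK_INC;
}

int main(void)
{
        printf("%d\n", is_contended(0, 2));   /* one holder, no waiters: 0 */
        printf("%d\n", is_contended(0, 4));   /* a waiter is queued:     1 */
        return 0;
}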
+ */ + +#undef memcpy +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b78ac3..751bf4b7bf11 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 854053889d4d..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -13,6 +13,33 @@ #include <asm/types.h> /* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. 
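[Annotation: the padding rationale above pairs with the new task_pt_regs() earlier in this series. A standalone arithmetic check of the 32-bit layout; THREAD_SIZE, the stack page address and the pt_regs size are illustrative values, not taken from the patch:]

#include <stdio.h>

#define THREAD_SIZE                 (2 * 4096UL)  /* assumed 8 KiB stack */
#define TOP_OF_KERNEL_STACK_PADDING 8UL           /* x86_32 value        */
#define PT_REGS_SIZE                68UL          /* illustrative only   */

int main(void)
{
        unsigned long stack_page = 0xc1000000UL;  /* hypothetical page   */
        unsigned long regs = stack_page + THREAD_SIZE
                             - TOP_OF_KERNEL_STACK_PADDING - PT_REGS_SIZE;

        printf("stack top  : %#lx\n", stack_page + THREAD_SIZE);
        printf("pt_regs at : %#lx (8 bytes of padding above it)\n", regs);
        return 0;
}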
+ */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + +/* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages @@ -31,7 +58,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +71,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) @@ -75,7 +98,6 @@ struct thread_info { #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ -#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ @@ -100,7 +122,6 @@ struct thread_info { #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) -#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) @@ -140,8 +161,8 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY) + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ + _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ @@ -151,7 +172,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) -#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) /* * macros/functions for gaining access to the thread information structure @@ -164,24 +184,53 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); +} + +static inline unsigned long current_stack_pointer(void) +{ + unsigned long sp; +#ifdef CONFIG_X86_64 + asm("mov %%rsp,%0" : "=g" (sp)); +#else + asm("mov %%esp,%0" : "=g" (sp)); +#endif + return sp; } #else /* !__ASSEMBLY__ */ -/* how to get the thread information struct from ASM */ +/* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; + _ASM_SUB $(THREAD_SIZE),reg ; /* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). + * ASM operand which evaluates to a 'thread_info' address of + * the current task, if it is known that "reg" is exactly "off" + * bytes below the top of the stack currently. + * + * ( The kernel stack's size is known at build time, it is usually + * 2 or 4 pages, and the bottom of the kernel stack contains + * the thread_info structure. 
So to access the thread_info very + * quickly from assembly code we can calculate down from the + * top of the kernel stack to the bottom, using constant, + * build-time calculations only. ) + * + * For example, to fetch the current thread_info->flags value into %eax + * on x86-64 defconfig kernels, in syscall entry code where RSP is + * currently at exactly SIZEOF_PTREGS bytes away from the top of the + * stack: + * + * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax + * + * will translate to: + * + * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax + * + * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) +#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif @@ -231,6 +280,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 04905bfc508b..cd791948b286 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -15,6 +15,75 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +struct tlb_state { +#ifdef CONFIG_SMP + struct mm_struct *active_mm; + int state; +#endif + + /* + * Access to this CR4 shadow and to H/W CR4 is protected by + * disabling interrupts when modifying either one. + */ + unsigned long cr4; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +/* Initialize cr4 shadow for this CPU. */ +static inline void cr4_init_shadow(void) +{ + this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); +} + +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 | mask) != cr4) { + cr4 |= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + if ((cr4 & ~mask) != cr4) { + cr4 &= ~mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); + } +} + +/* Read the CR4 shadow. */ +static inline unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} + +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. This should only be used + * during boot on the boot cpu. 
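[Annotation: the cr4_set_bits()/cr4_clear_bits() pair above keeps the per-CPU shadow and hardware CR4 in lockstep and skips the expensive write when nothing changes. A usage sketch, kernel context assumed, mirroring the virtext.h conversion later in this patch; the wrapper name is illustrative:]

static void vmx_toggle_example(bool enable)
{
        if (enable)
                cr4_set_bits(X86_CR4_VMXE);     /* shadow first, then hardware CR4 */
        else
                cr4_clear_bits(X86_CR4_VMXE);   /* both stay in sync */
}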
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index bc8352e7010a..4e49d7dff78e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_TRAPS_H
 #define _ASM_X86_TRAPS_H
 
+#include <linux/context_tracking_state.h>
 #include <linux/kprobes.h>
 
 #include <asm/debugreg.h>
@@ -39,6 +40,7 @@ asmlinkage void simd_coprocessor_error(void);
 
 #ifdef CONFIG_TRACING
 asmlinkage void trace_page_fault(void);
+#define trace_stack_segment stack_segment
 #define trace_divide_error divide_error
 #define trace_bounds bounds
 #define trace_invalid_op invalid_op
@@ -109,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
 asmlinkage void mce_threshold_interrupt(void);
 #endif
 
+extern enum ctx_state ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_begin_non_atomic(struct pt_regs *regs);
+extern void ist_end_non_atomic(void);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 0d592e0a5b84..ace9dec050b1 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
 	asm volatile("call __get_user_%P3"				\
 		     : "=a" (__ret_gu), "=r" (__val_gu)			\
 		     : "0" (ptr), "i" (sizeof(*(ptr))));		\
-	(x) = (__typeof__(*(ptr))) __val_gu;				\
+	(x) = (__force __typeof__(*(ptr))) __val_gu;			\
 	__ret_gu;							\
 })
 
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..f2f9b39b274a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 }
 
 unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+copy_user_handle_tail(char *to, char *from, unsigned len);
 
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 2d60a7813dfe..fc808b83fccb 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -33,8 +33,8 @@
  * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
  */
 
-#define MAX_CPUS_PER_UVHUB		64
-#define MAX_CPUS_PER_SOCKET		32
+#define MAX_CPUS_PER_UVHUB		128
+#define MAX_CPUS_PER_SOCKET		64
 #define ADP_SZ				64 /* hardware-provided max. */
 #define UV_CPUS_PER_AS			32 /* hardware-provided max. */
 #define ITEMS_PER_DESC			8
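Editor's note: the uaccess.h hunk above only adds a __force annotation so sparse accepts the cast from the raw register value back to a possibly __user-annotated type. A rough user-space sketch of the surrounding get_user()-style macro pattern; fake_get_user_helper() is invented (the kernel dispatches to size-specific __get_user_N assembly helpers), and __force is modelled as a no-op:

    #include <stdio.h>

    #define __force  /* sparse annotation; a no-op outside the kernel */

    /* Invented stand-in for the kernel's size-dispatched helpers. */
    static long fake_get_user_helper(const void *ptr, unsigned long size,
                                     unsigned long *val)
    {
            *val = *(const unsigned int *)ptr;   /* pretend size == 4 */
            (void)size;
            return 0;
    }

    #define get_user_sketch(x, ptr) ({                                    \
            unsigned long __val_gu;                                       \
            long __ret_gu = fake_get_user_helper((ptr), sizeof(*(ptr)),   \
                                                 &__val_gu);              \
            (x) = (__force __typeof__(*(ptr)))__val_gu;                   \
            __ret_gu;                                                     \
    })

    int main(void)
    {
            unsigned int src = 42, dst = 0;
            long err = get_user_sketch(dst, &src);
            printf("err=%ld dst=%u\n", err, dst);
            return 0;
    }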
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3c3366c2e37f..f556c4843aa1 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -70,4 +70,25 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
 	++s->seq;
 }
 
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+	unsigned int p;
+
+	/*
+	 * Load per CPU data from GDT. LSL is faster than RDTSCP and
+	 * works on all CPUs. This is volatile so that it orders
+	 * correctly wrt barrier() and to keep gcc from cleverly
+	 * hoisting it out of the calling function.
+	 */
+	asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+
+	return p;
+}
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
 
 #include <asm/vmx.h>
 #include <asm/svm.h>
+#include <asm/tlbflush.h>
 
 /*
  * VMX functions:
@@ -40,12 +41,12 @@ static inline int cpu_has_vmx(void)
 static inline void cpu_vmxoff(void)
 {
 	asm volatile (ASM_VMX_VMXOFF : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+	cr4_clear_bits(X86_CR4_VMXE);
 }
 
 static inline int cpu_vmx_enabled(void)
 {
-	return read_cr4() & X86_CR4_VMXE;
+	return __read_cr4() & X86_CR4_VMXE;
 }
 
 /** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade26d8d..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,8 @@
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING	0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID		0x00001000
 #define SECONDARY_EXEC_SHADOW_VMCS		0x00004000
+#define SECONDARY_EXEC_ENABLE_PML		0x00020000
+#define SECONDARY_EXEC_XSAVES			0x00100000
 
 #define PIN_BASED_EXT_INTR_MASK			0x00000001
@@ -120,6 +122,7 @@ enum vmcs_field {
 	GUEST_LDTR_SELECTOR             = 0x0000080c,
 	GUEST_TR_SELECTOR               = 0x0000080e,
 	GUEST_INTR_STATUS               = 0x00000810,
+	GUEST_PML_INDEX                 = 0x00000812,
 	HOST_ES_SELECTOR                = 0x00000c00,
 	HOST_CS_SELECTOR                = 0x00000c02,
 	HOST_SS_SELECTOR                = 0x00000c04,
@@ -139,6 +142,8 @@ enum vmcs_field {
 	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
 	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
 	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
+	PML_ADDRESS                     = 0x0000200e,
+	PML_ADDRESS_HIGH                = 0x0000200f,
 	TSC_OFFSET                      = 0x00002010,
 	TSC_OFFSET_HIGH                 = 0x00002011,
 	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
@@ -159,6 +164,8 @@ enum vmcs_field {
 	EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
 	VMREAD_BITMAP                   = 0x00002026,
 	VMWRITE_BITMAP                  = 0x00002028,
+	XSS_EXIT_BITMAP                 = 0x0000202C,
+	XSS_EXIT_BITMAP_HIGH            = 0x0000202D,
 	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
 	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
 	VMCS_LINK_POINTER               = 0x00002800,
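Editor's note: the vgtod.h hunk above makes __getcpu() read a per-CPU value out of a GDT segment limit with LSL. In the vDSO that value packs both CPU and NUMA node; assuming the usual encoding ((node << 12) | cpu, with the low 12 bits selected by VGETCPU_CPU_MASK), the decode side is a one-liner each way, as in this sketch:

    #include <stdio.h>

    #define VGETCPU_CPU_MASK 0xfff

    /* Unpack an LSL result the way the vDSO's getcpu does (assumed layout). */
    static void decode_getcpu(unsigned int p, unsigned *cpu, unsigned *node)
    {
            *cpu  = p & VGETCPU_CPU_MASK;   /* low 12 bits: CPU number */
            *node = p >> 12;                /* remaining bits: node    */
    }

    int main(void)
    {
            unsigned cpu, node;

            decode_getcpu((1u << 12) | 3, &cpu, &node); /* node 1, cpu 3 */
            printf("cpu=%u node=%u\n", cpu, node);
            return 0;
    }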
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index 2a46ca720afc..6ba66ee79710 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -4,15 +4,7 @@
 #include <linux/seqlock.h>
 #include <uapi/asm/vsyscall.h>
 
-#define VGETCPU_RDTSCP	1
-#define VGETCPU_LSL	2
-
-/* kernel space (writeable) */
-extern int vgetcpu_mode;
-extern struct timezone sys_tz;
-
-#include <asm/vvar.h>
-
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
 
 /*
@@ -20,25 +12,12 @@ extern void map_vsyscall(void);
  * Returns true if handled.
  */
 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
-
-#ifdef CONFIG_X86_64
-
-#define VGETCPU_CPU_MASK 0xfff
-
-static inline unsigned int __getcpu(void)
+#else
+static inline void map_vsyscall(void) {}
+static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 {
-	unsigned int p;
-
-	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
-
-	return p;
+	return false;
 }
-#endif /* CONFIG_X86_64 */
+#endif
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 5d2b9ad2c6d2..3f32dfc2ab73 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -44,8 +44,6 @@ extern char __vvar_page;
 
 /* DECLARE_VVAR(offset, type, name) */
 
-DECLARE_VVAR(0, volatile unsigned long, jiffies)
-DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
 
 #undef DECLARE_VVAR
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index e45e4da96bf1..f58a9c7a3c86 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -172,7 +172,6 @@ struct x86_platform_ops {
 
 struct pci_dev;
 struct msi_msg;
-struct msi_desc;
 
 struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
@@ -183,8 +182,6 @@ struct x86_msi_ops {
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
 	void (*restore_msi_irqs)(struct pci_dev *dev);
 	int (*setup_hpet_msi)(unsigned int irq, unsigned int id);
-	u32 (*msi_mask_irq)(struct msi_desc *desc, u32 mask, u32 flag);
-	u32 (*msix_mask_irq)(struct msi_desc *desc, u32 flag);
 };
 
 struct IO_APIC_route_entry;
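Editor's note: the vsyscall.h change above is an instance of the config-stub idiom: when CONFIG_X86_VSYSCALL_EMULATION is off, static inline no-ops with identical signatures keep every caller free of #ifdefs. A generic header-style sketch of the idiom; the feature names are made up:

    #include <stdbool.h>

    #define CONFIG_FEATURE_X 1  /* flip to 0 to compile the stubs instead */

    #if CONFIG_FEATURE_X
    /* Real implementations live in some other translation unit. */
    void feature_init(void);
    bool feature_handle(unsigned long addr);  /* returns true if handled */
    #else
    /* Stubs: same signatures, so call sites need no conditionals. */
    static inline void feature_init(void) {}
    static inline bool feature_handle(unsigned long addr) { return false; }
    #endif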
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
new file mode 100644
index 000000000000..0d809e9fc975
--- /dev/null
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ *    Keir Fraser <keir@xen.org>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/*
+ * For compatibility with other hypervisor interfaces, the Xen cpuid leaves
+ * can be found at the first otherwise unused 0x100 aligned boundary starting
+ * from 0x40000000.
+ *
+ * e.g. if viridian extensions are enabled for an HVM domain, the Xen cpuid
+ * leaves will start at 0x40000100
+ */
+
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i)    (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000x00)
+ * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
+ *      are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ *      of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000x01)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000x02)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ *      to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
+
+/*
+ * Leaf 5 (0x40000x04)
+ * HVM-specific features
+ */
+
+/* EAX Features */
+/* Virtualized APIC registers */
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0)
+/* Virtualized x2APIC accesses */
+#define XEN_HVM_CPUID_X2APIC_VIRT      (1u << 1)
+/* Memory mapped from other domains has valid IOMMU entries */
+#define XEN_HVM_CPUID_IOMMU_MAPPINGS   (1u << 2)
+
+#define XEN_CPUID_MAX_NUM_LEAVES 4
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h
index 7f02fe4e2c7b..acd844c017d3 100644
--- a/arch/x86/include/asm/xen/page-coherent.h
+++ b/arch/x86/include/asm/xen/page-coherent.h
@@ -22,8 +22,8 @@ static inline void xen_free_coherent_pages(struct device *hwdev, size_t size,
 }
 
 static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
-	     unsigned long offset, size_t size, enum dma_data_direction dir,
-	     struct dma_attrs *attrs) { }
+	     dma_addr_t dev_addr, unsigned long offset, size_t size,
+	     enum dma_data_direction dir, struct dma_attrs *attrs) { }
 
 static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir,
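Editor's note: leaf 1 of the new cpuid.h above is enough to identify a Xen host. A user-space sketch using GCC's <cpuid.h> intrinsic, scanning 0x100-aligned bases as the header's comment describes; the 0x40010000 scan limit is an assumption borrowed from common hypervisor-detection code, not part of this patch:

    #include <cpuid.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t xen_cpuid_base(void)
    {
            uint32_t base, eax, ebx, ecx, edx;

            for (base = 0x40000000; base < 0x40010000; base += 0x100) {
                    __cpuid(base, eax, ebx, ecx, edx);
                    if (ebx == 0x566e6558 &&        /* "XenV" */
                        ecx == 0x65584d4d &&        /* "MMXe" */
                        edx == 0x4d4d566e)          /* "nVMM" */
                            return base;
            }
            return 0;
    }

    int main(void)
    {
            uint32_t base = xen_cpuid_base();

            if (base)
                    printf("Xen signature at leaf %#x\n", base);
            else
                    printf("no Xen signature found\n");
            return 0;
    }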
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c949923a5668..358dcd338915 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -41,10 +41,12 @@ typedef struct xpaddr {
 
 extern unsigned long *machine_to_phys_mapping;
 extern unsigned long machine_to_phys_nr;
+extern unsigned long *xen_p2m_addr;
+extern unsigned long xen_p2m_size;
+extern unsigned long xen_max_p2m_pfn;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern unsigned long set_phys_range_identity(unsigned long pfn_s,
 					     unsigned long pfn_e);
@@ -52,16 +54,50 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 				   struct gnttab_map_grant_ref *kmap_ops,
 				   struct page **pages, unsigned int count);
-extern int m2p_add_override(unsigned long mfn, struct page *page,
-			    struct gnttab_map_grant_ref *kmap_op);
 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
-				     struct gnttab_map_grant_ref *kmap_ops,
+				     struct gnttab_unmap_grant_ref *kunmap_ops,
 				     struct page **pages, unsigned int count);
-extern int m2p_remove_override(struct page *page,
-			       struct gnttab_map_grant_ref *kmap_op,
-			       unsigned long mfn);
-extern struct page *m2p_find_override(unsigned long mfn);
-extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
+/*
+ * Helper functions to write or read unsigned long values to/from
+ * memory, when the access may fault.
+ */
+static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
+{
+	return __put_user(val, (unsigned long __user *)addr);
+}
+
+static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
+{
+	return __get_user(*val, (unsigned long __user *)addr);
+}
+
+/*
+ * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
+ * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
+ *   bits (identity or foreign) are set.
+ * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
+ *   identity or foreign indicator will still be set. __pfn_to_mfn()
+ *   encapsulates get_phys_to_machine(), which is called in special cases only.
+ * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
+ *   cases needing an extended handling.
+ */
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+	unsigned long mfn;
+
+	if (pfn < xen_p2m_size)
+		mfn = xen_p2m_addr[pfn];
+	else if (unlikely(pfn < xen_max_p2m_pfn))
+		return get_phys_to_machine(pfn);
+	else
+		return IDENTITY_FRAME(pfn);
+
+	if (unlikely(mfn == INVALID_P2M_ENTRY))
+		return get_phys_to_machine(pfn);
+
+	return mfn;
+}
 
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
@@ -70,7 +106,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return pfn;
 
-	mfn = get_phys_to_machine(pfn);
+	mfn = __pfn_to_mfn(pfn);
 
 	if (mfn != INVALID_P2M_ENTRY)
 		mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
@@ -83,7 +119,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
 	if (xen_feature(XENFEAT_auto_translated_physmap))
 		return 1;
 
-	return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+	return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
 }
 
 static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
@@ -102,7 +138,7 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
 	 * In such cases it doesn't matter what we return (we return garbage),
 	 * but we must handle the fault without crashing!
 	 */
-	ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+	ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
 	if (ret < 0)
 		return ~0;
 
@@ -117,24 +153,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
 		return mfn;
 
 	pfn = mfn_to_pfn_no_overrides(mfn);
-	if (get_phys_to_machine(pfn) != mfn) {
-		/*
-		 * If this appears to be a foreign mfn (because the pfn
-		 * doesn't map back to the mfn), then check the local override
-		 * table to see if there's a better pfn to use.
-		 *
-		 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
-		 */
-		pfn = m2p_find_override_pfn(mfn, ~0);
-	}
+	if (__pfn_to_mfn(pfn) != mfn)
+		pfn = ~0;
 
 	/*
-	 * pfn is ~0 if there are no entries in the m2p for mfn or if the
-	 * entry doesn't map back to the mfn and m2p_override doesn't have a
-	 * valid entry for it.
+	 * pfn is ~0 if there are no entries in the m2p for mfn or the
+	 * entry doesn't map back to the mfn.
	 */
-	if (pfn == ~0 &&
-	    get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+	if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
 		pfn = mfn;
 
 	return pfn;
@@ -180,7 +206,7 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 		return mfn;
 
 	pfn = mfn_to_pfn(mfn);
-	if (get_phys_to_machine(pfn) != mfn)
+	if (__pfn_to_mfn(pfn) != mfn)
 		return -1; /* force !pfn_valid() */
 	return pfn;
 }
@@ -236,4 +262,11 @@ void make_lowmem_page_readwrite(void *vaddr);
 #define xen_remap(cookie, size) ioremap((cookie), (size));
 #define xen_unmap(cookie) iounmap((cookie))
 
+static inline bool xen_arch_need_swiotlb(struct device *dev,
+					 unsigned long pfn,
+					 unsigned long mfn)
+{
+	return false;
+}
+
 #endif /* _ASM_X86_XEN_PAGE_H */
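Editor's note: __pfn_to_mfn() above encodes a three-way lookup policy: a direct hit in the linear p2m array, a slow-path call for pfns between xen_p2m_size and xen_max_p2m_pfn, and identity mapping beyond the table. A self-contained sketch with toy table contents; IDENTITY_FRAME_BIT is assumed here to be bit 62, matching the 64-bit kernel's layout:

    #include <stdio.h>

    #define INVALID_P2M_ENTRY  (~0UL)
    #define IDENTITY_FRAME_BIT (1UL << 62)          /* assumed 64-bit layout */
    #define IDENTITY_FRAME(m)  ((m) | IDENTITY_FRAME_BIT)

    static unsigned long p2m[4] = { 100, INVALID_P2M_ENTRY, 102, 103 };
    static unsigned long p2m_size = 4, max_p2m_pfn = 8;

    /* Stand-in for the slower get_phys_to_machine() path. */
    static unsigned long slow_lookup(unsigned long pfn)
    {
            return IDENTITY_FRAME(pfn);
    }

    static unsigned long pfn_to_mfn_sketch(unsigned long pfn)
    {
            unsigned long mfn;

            if (pfn < p2m_size)
                    mfn = p2m[pfn];                 /* fast linear lookup */
            else if (pfn < max_p2m_pfn)
                    return slow_lookup(pfn);        /* special ranges */
            else
                    return IDENTITY_FRAME(pfn);     /* beyond the table */

            if (mfn == INVALID_P2M_ENTRY)
                    return slow_lookup(pfn);        /* missing entry */
            return mfn;
    }

    int main(void)
    {
            printf("%lx %lx %lx\n", pfn_to_mfn_sketch(0),
                   pfn_to_mfn_sketch(1), pfn_to_mfn_sketch(9));
            return 0;
    }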
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 7e7a79ada658..c9a6d68b8d62 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,7 @@
 #define XSTATE_Hi16_ZMM		0x80
 
 #define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512	(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
 /* Bit 63 of XCR0 is reserved for future expansion */
 #define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
 
@@ -81,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XSAVES"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XSAVE"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -111,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XRSTORS"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XRSTOR"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -148,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input_2(
 		"1:"XSAVE,
-		"1:"XSAVEOPT,
+		XSAVEOPT,
 		X86_FEATURE_XSAVEOPT,
-		"1:"XSAVES,
+		XSAVES,
 		X86_FEATURE_XSAVES,
 		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
 		"memory");
@@ -177,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input(
 		"1: " XRSTOR,
-		"1: " XRSTORS,
+		XRSTORS,
 		X86_FEATURE_XSAVES,
 		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 		: "memory");
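Editor's note: the new XSTATE_AVX512 mask above groups the three AVX-512 state components (opmask, ZMM_Hi256, Hi16_ZMM). As a usage sketch, user space can check that all three are enabled in XCR0 via the XGETBV intrinsic; compile with -mxsave, and note it assumes the CPU actually supports XSAVE/XGETBV:

    #include <immintrin.h>
    #include <stdio.h>

    #define XSTATE_OPMASK     0x20
    #define XSTATE_ZMM_Hi256  0x40
    #define XSTATE_Hi16_ZMM   0x80
    #define XSTATE_AVX512     (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)

    int main(void)
    {
            /* XCR0 holds the mask of state components the OS has enabled. */
            unsigned long long xcr0 = _xgetbv(0);

            printf("AVX-512 state %s\n",
                   (xcr0 & XSTATE_AVX512) == XSTATE_AVX512 ?
                   "enabled" : "not enabled");
            return 0;
    }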