From 40d04110f87940b6a03bf0aa19cd29e84f465f20 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Nov 2019 19:03:03 +0100 Subject: x86, kcsan: Enable KCSAN for x86 This patch enables KCSAN for x86, with updates to build rules to not use KCSAN for several incompatible compilation units. Signed-off-by: Marco Elver Acked-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- arch/x86/kernel/Makefile | 4 ++++ arch/x86/kernel/cpu/Makefile | 3 +++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..a9a1cab437bc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,6 +28,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n +# With some compiler versions the generated code results in boot hangs, caused +# by several compilation units. To be safe, disable all instrumentation. +KCSAN_SANITIZE := n + OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..1f1b0edc0187 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,6 +13,9 @@ endif KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +# As above, instrumenting secondary CPU boot code causes boot hangs. +KCSAN_SANITIZE_common.o := n + # Make sure load_percpu_segment has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -- cgit From 8efbc518b884e1db2dd6a6fce62d0112ab871dcf Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 12 Feb 2020 19:04:24 +0800 Subject: x86/kexec: Do not reserve EFI setup_data in the kexec e820 table The e820 table for the kexec kernel unconditionally marks setup_data as reserved because the second kernel can reuse setup_data passed by the 1st kernel's boot loader, for example SETUP_PCI marked regions like PCI BIOS, etc. SETUP_EFI types, however, are used by kexec itself to enable EFI in the 2nd kernel. Thus, it is pointless to add this type of setup_data to the kexec e820 table as reserved. IOW, what happens is this: - 1st physical boot: no SETUP_EFI. - kexec loads a new kernel and prepares a SETUP_EFI setup_data blob, then reboots the machine. - 2nd kernel sees SETUP_EFI, reserves it both in the e820 and in the kexec e820 table. - If another kexec load is executed, it prepares a new SETUP_EFI blob and then reboots the machine into the new kernel. - The 3rd kexec-ed kernel has two SETUP_EFI ranges reserved. And so on... Thus skip SETUP_EFI while reserving setup_data in the e820_table_kexec table because it is not needed. [ bp: Heavily massage commit message, shorten line and improve comment.
] Signed-off-by: Dave Young Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200212110424.GA2938@dhcp-128-65.nay.redhat.com --- arch/x86/kernel/e820.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..c92029651b85 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -999,7 +999,15 @@ void __init e820__reserve_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + /* + * SETUP_EFI is supplied by kexec and does not need to be + * reserved. + */ + if (data->type != SETUP_EFI) + e820__range_update_kexec(pa_data, + sizeof(*data) + data->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); if (data->type == SETUP_INDIRECT && ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { -- cgit From 2fa9a3cf3055db07a4835eb7bd48c648cb17ac26 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 24 Mar 2020 19:58:36 +0100 Subject: x86/smpboot: Remove the last ICPU() macro Now all is using the shiny new macros. No code changed: # arch/x86/kernel/smpboot.o: text data bss dec hex filename 16432 2649 40 19121 4ab1 smpboot.o.before 16432 2649 40 19121 4ab1 smpboot.o.after md5: a58104003b72c1de533095bc5a4c30a9 smpboot.o.before.asm a58104003b72c1de533095bc5a4c30a9 smpboot.o.after.asm Signed-off-by: Borislav Petkov Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20200324185836.GI22931@zn.tnic --- arch/x86/kernel/smpboot.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe3ab9632f3b..3b9bf8c7e29d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1849,24 +1849,25 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) #include #include -#define ICPU(model) \ - {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} +#define X86_MATCH(model) \ + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_XEON_PHI_KNL), - ICPU(INTEL_FAM6_XEON_PHI_KNM), + X86_MATCH(XEON_PHI_KNL), + X86_MATCH(XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_SKYLAKE_X), + X86_MATCH(SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { - ICPU(INTEL_FAM6_ATOM_GOLDMONT), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), - ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + X86_MATCH(ATOM_GOLDMONT), + X86_MATCH(ATOM_GOLDMONT_D), + X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; -- cgit From 593309423cbad0fab659a685834416cf12d8f581 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Apr 2020 01:33:05 +0200 Subject: x86/32: Remove CONFIG_DOUBLEFAULT Make the doublefault exception handler unconditional on 32-bit. Yes, it is important to be able to catch #DF exceptions instead of silent reboots. Yes, the code size increase is worth every byte. And one less CONFIG symbol is just the cherry on top. No functional changes. 
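For readers unfamiliar with the unwinder side of this: in_doublefault_stack(), which loses its #ifdef below, follows the usual stack-classification pattern. A minimal sketch of that pattern (the helper name in_example_stack() and the hard-coded STACK_TYPE_EXCEPTION are purely illustrative, not taken from the patch):

	static bool in_example_stack(unsigned long *stack, struct stack_info *info,
				     unsigned long *begin, unsigned long *end)
	{
		/* Not within this stack's bounds? Let the next classifier try. */
		if (stack < begin || stack >= end)
			return false;

		/* Tell the unwinder which stack it is walking. */
		info->type	= STACK_TYPE_EXCEPTION;
		info->begin	= begin;
		info->end	= end;
		return true;
	}
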
Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200404083646.8897-1-bp@alien8.de --- arch/x86/kernel/Makefile | 4 +--- arch/x86/kernel/dumpstack_32.c | 4 ---- arch/x86/kernel/traps.c | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..2a7c3afa62e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,9 +102,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -endif +obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 8e3a8fedfa4d..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); struct doublefault_stack *ss = &cea->doublefault_stack; @@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); return true; -#else - return false; -#endif } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..e85561fc0dc8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -326,7 +326,6 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * @@ -450,7 +449,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign die("double fault", regs, error_code); panic("Machine halted."); } -#endif dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { -- cgit From ada018b15ccecbdb95df46db7121516edb906bf6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 14 Feb 2020 18:32:43 +0100 Subject: x86/mce/amd: Do proper cleanup on error paths Drop kobject reference counts properly on error in the banks and blocks allocation functions. [ bp: Write commit message. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-2-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 52de616a8065..477cf773cf1c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1267,13 +1267,12 @@ recurse: if (b) kobject_uevent(&b->kobj, KOBJ_ADD); - return err; + return 0; out_free: if (b) { - kobject_put(&b->kobj); list_del(&b->miscj); - kfree(b); + kobject_put(&b->kobj); } return err; } @@ -1339,6 +1338,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } + /* Associate the bank with the per-CPU MCE device */ b->kobj = kobject_create_and_add(name, &dev->kobj); if (!b->kobj) { err = -EINVAL; @@ -1357,16 +1357,17 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank)); if (err) - goto out_free; + goto out_kobj; per_cpu(threshold_banks, cpu)[bank] = b; return 0; - out_free: +out_kobj: + kobject_put(b->kobj); +out_free: kfree(b); - - out: +out: return err; } -- cgit From c9bf318f77b3a78483e656e609d005c52aadc86d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Feb 2020 00:34:01 +0100 Subject: x86/mce/amd: Init thresholding machinery only on relevant vendors ... and not unconditionally. [ bp: Add a new vendor_flags bit for that. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-3-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 12 ++++++++++-- arch/x86/kernel/cpu/mce/core.c | 1 + arch/x86/kernel/cpu/mce/internal.h | 9 ++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 477cf773cf1c..c3b3326ad4ac 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1442,15 +1442,20 @@ free_out: int mce_threshold_remove_device(unsigned int cpu) { + struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank; + if (!bp) + return 0; + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } - kfree(per_cpu(threshold_banks, cpu)); - per_cpu(threshold_banks, cpu) = NULL; + /* Clear the pointer before freeing the memory */ + this_cpu_write(threshold_banks, NULL); + kfree(bp); return 0; } @@ -1461,6 +1466,9 @@ int mce_threshold_create_device(unsigned int cpu) struct threshold_bank **bp; int err = 0; + if (!mce_flags.amd_threshold) + return 0; + bp = per_cpu(threshold_banks, cpu); if (bp) return 0; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e8..43ca91e14a77 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1756,6 +1756,7 @@ static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c) mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV); mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR); mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA); + mce_flags.amd_threshold = 1; if (mce_flags.smca) { msr_ops.ctl = smca_ctl_reg; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 3b008172ad73..74a01829c4f4 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ 
b/arch/x86/kernel/cpu/mce/internal.h @@ -148,7 +148,7 @@ struct mce_vendor_flags { * Recovery. It indicates support for data poisoning in HW and deferred * error interrupts. */ - succor : 1, + succor : 1, /* * (AMD) SMCA: This bit indicates support for Scalable MCA which expands @@ -156,9 +156,12 @@ struct mce_vendor_flags { * banks. Also, to accommodate the new banks and registers, the MCA * register space is moved to a new MSR range. */ - smca : 1, + smca : 1, - __reserved_0 : 61; + /* AMD-style error thresholding banks present. */ + amd_threshold : 1, + + __reserved_0 : 60; }; extern struct mce_vendor_flags mce_flags; -- cgit From cca9cc05fe98f3eb0cfb58ec6739cfc9d0b4ccbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Mar 2020 20:05:43 +0100 Subject: x86/mce/amd: Protect a not-fully initialized bank from the thresholding interrupt Make sure the thresholding bank descriptor is fully initialized when the thresholding interrupt fires after a hotplug event. [ bp: Write commit message and document long-forgotten bank_map. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-4-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index c3b3326ad4ac..563942157758 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -192,7 +192,12 @@ EXPORT_SYMBOL_GPL(smca_banks); static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ + +/* + * A list of the banks enabled on each logical CPU. Controls which respective + * descriptors to initialize later in mce_threshold_create_device(). + */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* Map of banks that have more than MCA_MISC0 available. */ static DEFINE_PER_CPU(u32, smca_misc_banks_map); @@ -1016,13 +1021,22 @@ static void log_and_reset_block(struct threshold_block *block) static void amd_threshold_interrupt(void) { struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; + struct threshold_bank **bp = this_cpu_read(threshold_banks); unsigned int bank, cpu = smp_processor_id(); + /* + * Validate that the threshold bank has been initialized already. The + * handler is installed at boot time, but on a hotplug event the + * interrupt might fire before the data has been initialized. + */ + if (!bp) + return; + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; - first_block = per_cpu(threshold_banks, cpu)[bank]->blocks; + first_block = bp[bank]->blocks; if (!first_block) continue; @@ -1247,6 +1261,7 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb INIT_LIST_HEAD(&b->miscj); + /* This is safe as @tb is not visible yet */ if (tb->blocks) list_add(&b->miscj, &tb->blocks->miscj); else -- cgit From 6e7a41c63abcfee28734c4c8872dae8d642329b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 30 Mar 2020 16:21:54 +0200 Subject: x86/mce/amd: Sanitize thresholding device creation hotplug path Drop the stupid threshold_init_device() initcall iterating over all online CPUs in favor of properly setting up everything on the CPU hotplug path, when each CPU's callback is invoked. [ bp: Write commit message. 
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-5-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 57 ++++++++++++------------------------------ arch/x86/kernel/cpu/mce/core.c | 11 ++++++++ 2 files changed, 27 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 563942157758..d3c416b6052a 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1474,12 +1474,22 @@ int mce_threshold_remove_device(unsigned int cpu) return 0; } -/* create dir/files for all valid threshold banks */ +/** + * mce_threshold_create_device - Create the per-CPU MCE threshold device + * @cpu: The plugged in CPU + * + * Create directories and files for all valid threshold banks. + * + * This is invoked from the CPU hotplug callback which was installed in + * mcheck_init_device(). The invocation happens in context of the hotplug + * thread running on @cpu. The callback is invoked on all CPUs which are + * online when the callback is installed or during a real hotplug event. + */ int mce_threshold_create_device(unsigned int cpu) { unsigned int bank; struct threshold_bank **bp; - int err = 0; + int err; if (!mce_flags.amd_threshold) return 0; @@ -1500,49 +1510,14 @@ int mce_threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto err; - } - return err; -err: - mce_threshold_remove_device(cpu); - return err; -} - -static __init int threshold_init_device(void) -{ - unsigned lcpu = 0; - - /* to hit CPUs online before the notifier is up */ - for_each_online_cpu(lcpu) { - int err = mce_threshold_create_device(lcpu); - - if (err) - return err; + goto out_err; } if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; return 0; +out_err: + mce_threshold_remove_device(cpu); + return err; } -/* - * there are 3 funcs which need to be _initcalled in a logic sequence: - * 1. xen_late_init_mcelog - * 2. mcheck_init_device - * 3. threshold_init_device - * - * xen_late_init_mcelog must register xen_mce_chrdev_device before - * native mce_chrdev_device registration if running under xen platform; - * - * mcheck_init_device should be inited before threshold_init_device to - * initialize mce_device, otherwise a NULL ptr dereference will cause panic. - * - * so we use following _initcalls - * 1. device_initcall(xen_late_init_mcelog); - * 2. device_initcall_sync(mcheck_init_device); - * 3. late_initcall(threshold_init_device); - * - * when running under xen, the initcall order is 1,2,3; - * on baremetal, we skip 1 and we do only 2 and 3. - */ -late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 43ca91e14a77..a6009efdfe2b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -2481,6 +2481,13 @@ static __init void mce_init_banks(void) } } +/* + * When running on XEN, this initcall is ordered against the XEN mcelog + * initcall: + * + * device_initcall(xen_late_init_mcelog); + * device_initcall_sync(mcheck_init_device); + */ static __init int mcheck_init_device(void) { int err; @@ -2512,6 +2519,10 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; + /* + * Invokes mce_cpu_online() on all CPUs which are online when + * the state is installed. 
+ */ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", mce_cpu_online, mce_cpu_pre_down); if (err < 0) -- cgit From 6458de97fc15530b54477c4e2b70af653e8ac3d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 30 Mar 2020 20:30:45 +0200 Subject: x86/mce/amd: Straighten CPU hotplug path mce_threshold_create_device() hotplug callback runs on the plugged in CPU so: - use this_cpu_read() which is faster - pass in struct threshold_bank **bp to threshold_create_bank() and instead of doing per-CPU accesses - Use rdmsr_safe() instead of rdmsr_safe_on_cpu() which avoids an IPI. No functional changes. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-6-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index d3c416b6052a..a33d9a1caf36 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1223,10 +1223,10 @@ static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb u32 low, high; int err; - if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) + if ((bank >= this_cpu_read(mce_num_banks)) || (block >= NR_BLOCKS)) return 0; - if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + if (rdmsr_safe(address, &low, &high)) return 0; if (!(high & MASK_VALID_HI)) { @@ -1316,9 +1316,10 @@ static int __threshold_add_blocks(struct threshold_bank *b) return err; } -static int threshold_create_bank(unsigned int cpu, unsigned int bank) +static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, + unsigned int bank) { - struct device *dev = per_cpu(mce_device, cpu); + struct device *dev = this_cpu_read(mce_device); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(bank, NULL); @@ -1338,7 +1339,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - per_cpu(threshold_banks, cpu)[bank] = b; + bp[bank] = b; refcount_inc(&b->cpus); err = __threshold_add_blocks(b); @@ -1374,8 +1375,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_kobj; - per_cpu(threshold_banks, cpu)[bank] = b; - + bp[bank] = b; return 0; out_kobj: @@ -1487,35 +1487,33 @@ int mce_threshold_remove_device(unsigned int cpu) */ int mce_threshold_create_device(unsigned int cpu) { - unsigned int bank; + unsigned int numbanks, bank; struct threshold_bank **bp; int err; if (!mce_flags.amd_threshold) return 0; - bp = per_cpu(threshold_banks, cpu); + bp = this_cpu_read(threshold_banks); if (bp) return 0; - bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), - GFP_KERNEL); + numbanks = this_cpu_read(mce_num_banks); + bp = kcalloc(numbanks, sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) + for (bank = 0; bank < numbanks; ++bank) { + if (!(this_cpu_read(bank_map) & (1 << bank))) continue; - err = threshold_create_bank(cpu, bank); + err = threshold_create_bank(bp, cpu, bank); if (err) goto out_err; } + this_cpu_write(threshold_banks, bp); if (thresholding_irq_en) mce_threshold_vector = amd_threshold_interrupt; - return 0; out_err: mce_threshold_remove_device(cpu); -- cgit From f26d2580a7ddc84aa9e51e47fdbb5ad63dbee5a7 Mon 
Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Mar 2020 10:53:18 +0200 Subject: x86/mce/amd: Cleanup threshold device remove path Pass in the bank pointer directly to the cleaning up functions, obviating the need for per-CPU accesses. Make the clean up path interrupt-safe by cleaning the bank pointer first so that the rest of the teardown happens safe from the thresholding interrupt. No functional changes. [ bp: Write commit message and reverse bank->shared test to save an indentation level in threshold_remove_bank(). ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-7-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 79 ++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index a33d9a1caf36..16e7aea86ab1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -1362,6 +1362,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, } if (is_shared_bank(bank)) { + b->shared = 1; refcount_set(&b->cpus, 1); /* nb is already initialized, see above */ @@ -1391,21 +1392,16 @@ static void threshold_block_release(struct kobject *kobj) kfree(to_block(kobj)); } -static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) +static void deallocate_threshold_blocks(struct threshold_bank *bank) { - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; - - if (!head) - return; + struct threshold_block *pos, *tmp; - list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + list_for_each_entry_safe(pos, tmp, &bank->blocks->miscj, miscj) { list_del(&pos->miscj); kobject_put(&pos->kobj); } - kobject_put(&head->blocks->kobj); + kobject_put(&bank->blocks->kobj); } static void __threshold_remove_blocks(struct threshold_bank *b) @@ -1419,57 +1415,56 @@ static void __threshold_remove_blocks(struct threshold_bank *b) kobject_del(&pos->kobj); } -static void threshold_remove_bank(unsigned int cpu, int bank) +static void threshold_remove_bank(struct threshold_bank *bank) { struct amd_northbridge *nb; - struct threshold_bank *b; - b = per_cpu(threshold_banks, cpu)[bank]; - if (!b) - return; + if (!bank->blocks) + goto out_free; - if (!b->blocks) - goto free_out; + if (!bank->shared) + goto out_dealloc; - if (is_shared_bank(bank)) { - if (!refcount_dec_and_test(&b->cpus)) { - __threshold_remove_blocks(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; - return; - } else { - /* - * the last CPU on this node using the shared bank is - * going away, remove that bank now. - */ - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - nb->bank4 = NULL; - } + if (!refcount_dec_and_test(&bank->cpus)) { + __threshold_remove_blocks(bank); + return; + } else { + /* + * The last CPU on this node using the shared bank is going + * away, remove that bank now. 
+ */ + nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id())); + nb->bank4 = NULL; } - deallocate_threshold_block(cpu, bank); +out_dealloc: + deallocate_threshold_blocks(bank); -free_out: - kobject_del(b->kobj); - kobject_put(b->kobj); - kfree(b); - per_cpu(threshold_banks, cpu)[bank] = NULL; +out_free: + kobject_put(bank->kobj); + kfree(bank); } int mce_threshold_remove_device(unsigned int cpu) { struct threshold_bank **bp = this_cpu_read(threshold_banks); - unsigned int bank; + unsigned int bank, numbanks = this_cpu_read(mce_num_banks); if (!bp) return 0; - for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - threshold_remove_bank(cpu, bank); - } - /* Clear the pointer before freeing the memory */ + /* + * Clear the pointer before cleaning up, so that the interrupt won't + * touch anything of this. + */ this_cpu_write(threshold_banks, NULL); + + for (bank = 0; bank < numbanks; bank++) { + if (bp[bank]) { + threshold_remove_bank(bp[bank]); + bp[bank] = NULL; + } + } kfree(bp); return 0; } -- cgit From a037f3ca0ea0a660e3f961431095a88674b8f3c4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 31 Mar 2020 13:16:44 +0200 Subject: x86/mce/amd: Make threshold bank setting hotplug robust Handle the cases when the CPU goes offline before the bank setting/reading happens. [ bp: Write commit message. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200403161943.1458-8-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 16e7aea86ab1..15c87b87b901 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -386,6 +386,10 @@ static void threshold_restart_bank(void *_tr) struct thresh_restart *tr = _tr; u32 hi, lo; + /* sysfs write might race against an offline operation */ + if (this_cpu_read(threshold_banks)) + return; + rdmsr(tr->b->address, lo, hi); if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) @@ -1085,7 +1089,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) memset(&tr, 0, sizeof(tr)); tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1109,7 +1114,8 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) b->threshold_limit = new; tr.b = b; - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + if (smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1)) + return -ENODEV; return size; } @@ -1118,7 +1124,9 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 lo, hi; - rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); + /* CPU might be offline by now */ + if (rdmsr_on_cpu(b->cpu, b->address, &lo, &hi)) + return -ENODEV; return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - (THRESHOLD_MAX - b->threshold_limit))); -- cgit From 3e0fdec858d82c829774f271e88b5ceb17051551 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Apr 2020 09:55:10 +0200 Subject: x86/mce/amd, edac: Remove report_gart_errors ... because no one should be interested in spurious MCEs anyway. Make the filtering unconditional and move it to amd_filter_mce(). 
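For context, amd_filter_mce() is not called directly by the code below; it sits behind a small vendor dispatch in mce/core.c, which looks roughly like this (simplified sketch of the surrounding code of this era, not part of this patch):

	/* Called for each logged error; returning true drops the record. */
	bool filter_mce(struct mce *m)
	{
		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
			return amd_filter_mce(m);

		return false;
	}
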
Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200407163414.18058-2-bp@alien8.de --- arch/x86/kernel/cpu/mce/amd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 15c87b87b901..ea3cf714b7ad 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -577,14 +577,19 @@ bool amd_filter_mce(struct mce *m) { enum smca_bank_types bank_type = smca_get_bank_type(m->bank); struct cpuinfo_x86 *c = &boot_cpu_data; - u8 xec = (m->status >> 16) & 0x3F; /* See Family 17h Models 10h-2Fh Erratum #1114. */ if (c->x86 == 0x17 && c->x86_model >= 0x10 && c->x86_model <= 0x2F && - bank_type == SMCA_IF && xec == 10) + bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10) return true; + /* NB GART TLB error reporting is disabled by default. */ + if (c->x86 < 0x17) { + if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5) + return true; + } + return false; } -- cgit From c9c6d216ed28be6e2c91e3651af169eca284813a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:14 -0800 Subject: x86/mce: Rename "first" function as "early" It isn't going to be first on the notifier chain when the CEC is moved to be a normal user of the notifier chain. Fix the enum for the MCE_PRIO symbols to list them in reverse order so that the compiler can give them numbers from low to high priority. Add an entry for MCE_PRIO_CEC as the highest priority. [ bp: Use passive voice, add comments. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-2-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a6009efdfe2b..43b1519ad4e5 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -559,7 +559,7 @@ static bool cec_add_mce(struct mce *m) return false; } -static int mce_first_notifier(struct notifier_block *nb, unsigned long val, +static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; @@ -580,9 +580,9 @@ static int mce_first_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; } -static struct notifier_block first_nb = { - .notifier_call = mce_first_notifier, - .priority = MCE_PRIO_FIRST, +static struct notifier_block early_nb = { + .notifier_call = mce_early_notifier, + .priority = MCE_PRIO_EARLY, }; static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, @@ -2041,7 +2041,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); - mce_register_decode_chain(&first_nb); + mce_register_decode_chain(&early_nb); mce_register_decode_chain(&mce_uc_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); -- cgit From 9554bfe403bdfc084823df8695a01f28c680af61 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:15 -0800 Subject: x86/mce: Convert the CEC to use the MCE notifier The CEC code has its claws in a couple of routines in mce/core.c. Convert it to just register itself on the normal MCE notifier chain. [ bp: Make cec_add_elem() and cec_init() static. 
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-3-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 43b1519ad4e5..b033b3589630 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -544,21 +544,6 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); -static bool cec_add_mce(struct mce *m) -{ - if (!m) - return false; - - /* We eat only correctable DRAM errors with usable addresses. */ - if (mce_is_memory_error(m) && - mce_is_correctable(m) && - mce_usable_address(m)) - if (!cec_add_elem(m->addr >> PAGE_SHIFT)) - return true; - - return false; -} - static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -567,9 +552,6 @@ static int mce_early_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (cec_add_mce(m)) - return NOTIFY_STOP; - /* Emit the trace record: */ trace_mce_record(m); @@ -2612,7 +2594,6 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); - cec_init(); /* * Flush out everything that has been logged during early boot, now that -- cgit From 23ba710a0864108910c7531dc4c73ef65eca5568 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:17 -0800 Subject: x86/mce: Fix all mce notifiers to update the mce->kflags bitmask If the handler took any action to log or deal with the error, set a bit in mce->kflags so that the default handler on the end of the machine check chain can see what has been done. Get rid of NOTIFY_STOP returns. Make the EDAC and dev-mcelog handlers skip over errors already processed by CEC. 
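The resulting pattern for any handler on the decode chain is sketched below; MCE_HANDLED_CEC and MCE_HANDLED_MCELOG are flag names from this series, while example_notifier() is a made-up stand-in:

	static int example_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
	{
		struct mce *m = (struct mce *)data;

		/* Skip records a higher-priority handler already consumed. */
		if (m->kflags & MCE_HANDLED_CEC)
			return NOTIFY_DONE;

		/* ... log or otherwise act on the error ... */

		/* Record what was done instead of stopping the chain. */
		m->kflags |= MCE_HANDLED_MCELOG;
		return NOTIFY_OK;
	}
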
Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-5-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 4 +++- arch/x86/kernel/cpu/mce/dev-mcelog.c | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b033b3589630..5666a48a4bc9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -581,8 +581,10 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val, return NOTIFY_DONE; pfn = mce->addr >> PAGE_SHIFT; - if (!memory_failure(pfn, 0)) + if (!memory_failure(pfn, 0)) { set_mce_nospec(pfn); + mce->kflags |= MCE_HANDLED_UC; + } return NOTIFY_OK; } diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index d089567a9ce8..c033e7ea9e3c 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -39,6 +39,9 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, struct mce *mce = (struct mce *)data; unsigned int entry; + if (mce->kflags & MCE_HANDLED_CEC) + return NOTIFY_DONE; + mutex_lock(&mce_chrdev_read_mutex); entry = mcelog->next; @@ -56,6 +59,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, memcpy(mcelog->entry + entry, mce, sizeof(struct mce)); mcelog->entry[entry].finished = 1; + mcelog->entry[entry].kflags = 0; /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); @@ -63,6 +67,7 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val, unlock: mutex_unlock(&mce_chrdev_read_mutex); + mce->kflags |= MCE_HANDLED_MCELOG; return NOTIFY_OK; } -- cgit From 925946cfa715a5a71639528f82b98e58f14dd4cb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:18 -0800 Subject: x86/mce: Change default MCE logger to check mce->kflags Instead of keeping count of how many handlers are registered on the MCE notifier chain and printing if below some magic value, look at mce->kflags to see if anyone claims to have handled/logged this error. [ bp: Do not print ->kflags in __print_mce(). ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-6-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5666a48a4bc9..fc879b6669d5 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -158,29 +158,17 @@ void mce_log(struct mce *m) } EXPORT_SYMBOL_GPL(mce_log); -/* - * We run the default notifier if we have only the UC, the first and the - * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS - * notifiers registered on the chain. 
- */ -#define NUM_DEFAULT_NOTIFIERS 3 -static atomic_t num_notifiers; - void mce_register_decode_chain(struct notifier_block *nb) { if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) return; - atomic_inc(&num_notifiers); - blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { - atomic_dec(&num_notifiers); - blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -263,6 +251,7 @@ static void __print_mce(struct mce *m) } pr_cont("\n"); + /* * Note this output is parsed by external tools and old fields * should not be changed. @@ -602,10 +591,8 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) - return NOTIFY_DONE; - - __print_mce(m); + if (!m->kflags) + __print_mce(m); return NOTIFY_DONE; } -- cgit From 43505646941bee217b91d064756975aa1ab6ee3b Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 14 Feb 2020 14:27:19 -0800 Subject: x86/mce: Add mce=print_all option Sometimes, when logs are getting lost, it's nice to just have everything dumped to the serial console. Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200214222720.13168-7-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/core.c | 7 ++++++- arch/x86/kernel/cpu/mce/internal.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index fc879b6669d5..4efe6c128887 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -591,7 +591,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - if (!m->kflags) + if (mca_cfg.print_all || !m->kflags) __print_mce(m); return NOTIFY_DONE; @@ -1962,6 +1962,7 @@ void mce_disable_bank(int bank) * mce=no_cmci Disables CMCI * mce=no_lmce Disables LMCE * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=print_all Print all machine check logs to console * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. 
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) * monarchtimeout is how long to wait for other CPUs on machine @@ -1990,6 +1991,8 @@ static int __init mcheck_enable(char *str) cfg->lmce_disabled = 1; else if (!strcmp(str, "dont_log_ce")) cfg->dont_log_ce = true; + else if (!strcmp(str, "print_all")) + cfg->print_all = true; else if (!strcmp(str, "ignore_ce")) cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) @@ -2256,6 +2259,7 @@ static ssize_t store_int_with_restart(struct device *s, static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); +static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all); static struct dev_ext_attribute dev_attr_check_interval = { __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), @@ -2280,6 +2284,7 @@ static struct device_attribute *mce_device_attrs[] = { #endif &dev_attr_monarch_timeout.attr, &dev_attr_dont_log_ce.attr, + &dev_attr_print_all.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, NULL diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 74a01829c4f4..55f5c7b755f2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -119,6 +119,7 @@ struct mca_config { bool dont_log_ce; bool cmci_disabled; bool ignore_ce; + bool print_all; __u64 lmce_disabled : 1, disabled : 1, -- cgit From 1df73b2131e3b33d518609769636b41ce00212de Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 7 Apr 2020 13:49:58 +0200 Subject: x86/mce: Fixup exception only for the correct MCEs The severity grading code returns IN_KERNEL_RECOV error context for errors which have happened in kernel space but from which the kernel can recover. Whether the recovery can happen is determined by the exception table entry having as handler ex_handler_fault() and which has been declared at build time using _ASM_EXTABLE_FAULT(). IN_KERNEL_RECOV is used in mce_severity_intel() to lookup the corresponding error severity in the severities table. However, the mapping back from error severity to whether the error is IN_KERNEL_RECOV is ambiguous and in the very paranoid case - which might not be possible right now - but be better safe than sorry later, an exception fixup could be attempted for another MCE whose address is in the exception table and has the proper severity. Which would be unfortunate, to say the least. Therefore, mark such MCEs explicitly as MCE_IN_KERNEL_RECOV so that the recovery attempt is done only for them. Document the whole handling, while at it, as it is not trivial. 
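For context, MCE_IN_KERNEL_RECOV can only ever be set for instruction addresses that were annotated at build time. A recoverable access looks roughly like the sketch below (illustrative only: labels and operand names are made up; ex_handler_fault() resumes execution at the fixup label and stores the trap number in regs->ax):

	static u64 recoverable_load(u64 *src)
	{
		u64 val;

		asm volatile("1:	movq (%[src]), %[val]\n"
			     "2:\n"
			     _ASM_EXTABLE_FAULT(1b, 2b)
			     : [val] "=r" (val)
			     : [src] "r" (src));

		/* On an #MC at 1:, execution resumes at 2: instead of dying. */
		return val;
	}
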
Reported-by: Thomas Gleixner Signed-off-by: Borislav Petkov Tested-by: Tony Luck Link: https://lkml.kernel.org/r/20200407163414.18058-10-bp@alien8.de --- arch/x86/kernel/cpu/mce/core.c | 15 +++++++++++++-- arch/x86/kernel/cpu/mce/severity.c | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4efe6c128887..02e1f165f148 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1331,8 +1331,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) local_irq_disable(); ist_end_non_atomic(); } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + /* + * Handle an MCE which has happened in kernel space but from + * which the kernel can recover: ex_has_fault_handler() has + * already verified that the rIP at which the error happened is + * a rIP from which the kernel can recover (by jumping to + * recovery code specified in _ASM_EXTABLE_FAULT()) and the + * corresponding exception handler which would do that is the + * proper one. + */ + if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + mce_panic("Failed kernel mode recovery", &m, msg); + } } out_ist: diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 87bcdc6dc2f0..e1da619add19 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -213,8 +213,12 @@ static int error_context(struct mce *m) { if ((m->cs & 3) == 3) return IN_USER; - if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) { + m->kflags |= MCE_IN_KERNEL_RECOV; return IN_KERNEL_RECOV; + } + return IN_KERNEL; } -- cgit From 4c5b566c2193e2af82c891daa5303c8899e61044 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 31 Mar 2020 02:15:44 +0800 Subject: crash_dump: Remove no longer used saved_max_pfn saved_max_pfn was originally introduced in commit 92aa63a5a1bf ("[PATCH] kdump: Retrieve saved max pfn") It used to make sure that the user does not try to read the physical memory beyond saved_max_pfn. But since commit 921d58c0e699 ("vmcore: remove saved_max_pfn check") it's no longer used for the check. This variable doesn't have any users anymore so just remove it. [ bp: Drop the Calgary IOMMU reference from the commit message. ] Signed-off-by: Kairui Song Signed-off-by: Borislav Petkov Acked-by: "Eric W. Biederman" Link: https://lkml.kernel.org/r/20200330181544.1595733-1-kasong@redhat.com --- arch/x86/kernel/e820.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index c5399e80c59c..4d13c57f370a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p) return -EINVAL; if (!strncmp(p, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* - * If we are doing a crash dump, we still need to know - * the real memory size before the original memory map is - * reset. 
- */ - saved_max_pfn = e820__end_of_ram_pfn(); -#endif e820_table->nr_entries = 0; userdef = 1; return 0; -- cgit From 968e6147fcc5862096863980298f3ec4ae5742eb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 26 Mar 2020 19:54:15 +0200 Subject: x86/early_printk: Remove unused includes After 1bd187de5364 ("x86, intel-mid: remove Intel MID specific serial support") the Intel MID header is not needed anymore. After 69c1f396f25b ("efi/x86: Convert x86 EFI earlyprintk into generic earlycon implementation") the EFI headers are not needed anymore. Remove the respective includes. Signed-off-by: Andy Shevchenko Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200326175415.8618-1-andriy.shevchenko@linux.intel.com --- arch/x86/kernel/early_printk.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b33904251a9..93fbdff2974f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -15,12 +15,9 @@ #include #include #include -#include #include #include #include -#include -#include #include /* Simple VGA output */ -- cgit From 93920f61c2ad7edb01e63323832585796af75fc9 Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:32:42 +0200 Subject: x86/cpu: Add 'table' argument to cpu_matches() To make cpu_matches() reusable for other matching tables, have it take a pointer to a x86_cpu_id table as an argument. [ bp: Flip arguments order. ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf --- arch/x86/kernel/cpu/common.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..1131ae032bf2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,9 +1075,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; -static bool __init cpu_matches(unsigned long which) +static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { - const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); + const struct x86_cpu_id *m = x86_match_cpu(table); return m && !!(m->driver_data & which); } @@ -1097,31 +1097,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) u64 ia32_cap = x86_read_arch_cap_msr(); /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ - if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && + !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); - if (cpu_matches(NO_SPECULATION)) + if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && + if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && + !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); if (ia32_cap & ARCH_CAP_IBRS_ALL) setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); - if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { + if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && + !(ia32_cap & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); - if 
(cpu_matches(MSBDS_ONLY)) + if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); } - if (!cpu_matches(NO_SWAPGS)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); /* @@ -1139,7 +1142,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); - if (cpu_matches(NO_MELTDOWN)) + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ @@ -1148,7 +1151,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - if (cpu_matches(NO_L1TF)) + if (cpu_matches(cpu_vuln_whitelist, NO_L1TF)) return; setup_force_cpu_bug(X86_BUG_L1TF); -- cgit From e9d7144597b10ff13ff2264c059f7d4a7fbc89ac Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:23:10 +0200 Subject: x86/cpu: Add a steppings field to struct x86_cpu_id Intel uses the same family/model for several CPUs. Sometimes the stepping must be checked to tell them apart. On x86 there can be at most 16 steppings. Add a steppings bitmask to x86_cpu_id and a X86_MATCH_VENDOR_FAMILY_MODEL_STEPPING_FEATURE macro and support for matching against family/model/stepping. [ bp: Massage. ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf --- arch/x86/kernel/cpu/match.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; -- cgit From 7e5b3c267d256822407a22fdce6afdf9cd13f9fb Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:54:04 +0200 Subject: x86/speculation: Add Special Register Buffer Data Sampling (SRBDS) mitigation SRBDS is an MDS-like speculative side channel that can leak bits from the random number generator (RNG) across cores and threads. New microcode serializes the processor access during the execution of RDRAND and RDSEED. This ensures that the shared buffer is overwritten before it is released for reuse. While it is present on all affected CPU models, the microcode mitigation is not needed on models that enumerate ARCH_CAPABILITIES[MDS_NO] in the cases where TSX is not supported or has been disabled with TSX_CTRL. The mitigation is activated by default on affected processors and it increases latency for RDRAND and RDSEED instructions. Among other effects this will reduce throughput from /dev/urandom. * Enable administrator to configure the mitigation off when desired using either mitigations=off or srbds=off. 
* Export vulnerability status via sysfs * Rename file-scoped macros to apply for non-whitelist table initializations. [ bp: Massage, - s/VULNBL_INTEL_STEPPING/VULNBL_INTEL_STEPPINGS/g, - do not read arch cap MSR a second time in tsx_fused_off() - just pass it in, - flip check in cpu_set_bug_bits() to save an indentation level, - reflow comments. jpoimboe: s/Mitigated/Mitigation/ in user-visible strings tglx: Dropped the fused off magic for now ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Pawan Gupta Reviewed-by: Josh Poimboeuf Tested-by: Neelima Krishnan --- arch/x86/kernel/cpu/bugs.c | 106 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 31 +++++++++++++ arch/x86/kernel/cpu/cpu.h | 1 + 3 files changed, 138 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..56978cb06149 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -397,6 +399,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) } early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. 
+ */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -1528,6 +1621,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1131ae032bf2..8293ee514975 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { const struct x86_cpu_id *m = x86_match_cpu(table); @@ -1142,6 +1163,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. 
+	 */
+	if ((cpu_has(c, X86_FEATURE_RDRAND) ||
+	     cpu_has(c, X86_FEATURE_RDSEED)) &&
+	    cpu_matches(cpu_vuln_blacklist, SRBDS))
+		setup_force_cpu_bug(X86_BUG_SRBDS);
+
 	if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
 		return;
@@ -1594,6 +1624,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 	mtrr_ap_init();
 	validate_apic_and_package_id(c);
 	x86_spec_ctrl_setup_ap();
+	update_srbds_msr();
 }

 static __init int setup_noclflush(char *arg)
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 37fdefd14f28..fb538fccd24c 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c);
 unsigned int aperfmperf_get_khz(int cpu);

 extern void x86_spec_ctrl_setup_ap(void);
+extern void update_srbds_msr(void);

 extern u64 x86_read_arch_cap_msr(void);
-- cgit

From 0298739b7983cf9bf4fcfb4bfb815c539bdb87ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 1 Apr 2020 16:53:19 +0200
Subject: x86,ftrace: Fix ftrace_regs_caller() unwind

The ftrace_regs_caller() trampoline does something 'funny' when there is a direct-caller present. In that case it stuffs the 'direct-caller' address on the return stack and then exits the function. This then results in 'returning' to the direct-caller with the exact registers we came in with -- an indirect tail-call without using a register.

This however (rightfully) confuses objtool because the function shares a few instructions in order to have a single exit path, but the stack layout is different for them, depending on which path we came through.

This is currently kludged by forcing the stack state to the non-direct case, but this generates actively wrong (ORC) unwind information for the direct case, leading to potentially broken unwinds.

Fix this issue by fully separating the exit paths. This results in having to poke a second RET into the trampoline copy, see ftrace_regs_caller_ret.

This brings us to a second objtool problem: in order for it to perceive the 'jmp ftrace_epilogue' as a function exit, it needs to be recognised as a tail call. In order to make that happen, ftrace_epilogue needs to be the start of an STT_FUNC, so re-arrange code to make this so.

Finally, a third issue is that objtool requires functions to exit with the same stack layout they started with, which is obviously violated in the direct case; employ the new HINT_RET_OFFSET to tell objtool this is an expected exception.

Together, this results in generating correct ORC unwind information for the ftrace_regs_caller() function and its trampoline copies.
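Condensed, the patching step this requires looks like the following sketch (an illustration using the symbol names from the ftrace.c hunk below, not a drop-in replacement for it):

	/*
	 * The trampoline is a byte-for-byte copy of ftrace_regs_caller,
	 * so a label keeps the same offset in the copy as in the
	 * original. Locate ftrace_regs_caller_ret in the copy and poke
	 * a RET there for the direct-call exit; 'retq' holds the
	 * address of an existing RET instruction to copy from.
	 */
	unsigned long offset = (unsigned long)ftrace_regs_caller_ret -
			       (unsigned long)ftrace_regs_caller;
	void *ip = trampoline + offset;

	ret = probe_kernel_read(ip, (void *)retq, RET_SIZE);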
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.749606694@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 12 ++++++++++-- arch/x86/kernel/ftrace_64.S | 32 +++++++++++++++----------------- 2 files changed, 25 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 37a0aeaf89e7..867c126ddabe 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -282,7 +282,8 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_epilogue(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -334,7 +335,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; } @@ -366,6 +367,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); + ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + if (WARN_ON(ret < 0)) + goto fail; + } + /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 369e61faacfe..7657dc782828 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -157,8 +157,12 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * think twice before adding any new code or changing the * layout here. */ -SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + jmp ftrace_epilogue +SYM_FUNC_END(ftrace_caller); + +SYM_FUNC_START(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub @@ -170,14 +174,12 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -SYM_FUNC_END(ftrace_caller) +SYM_FUNC_END(ftrace_epilogue) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq - UNWIND_HINT_SAVE - /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -233,7 +235,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq ORIG_RAX(%rsp), %rax movq %rax, MCOUNT_REG_SIZE-8(%rsp) - /* If ORIG_RAX is anything but zero, make this a call to that */ + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). 
+ */ movq ORIG_RAX(%rsp), %rax cmpq $0, %rax je 1f @@ -244,20 +249,14 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq %rax, MCOUNT_REG_SIZE(%rsp) restore_mcount_regs 8 + /* Restore flags */ + popfq - jmp 2f +SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); + UNWIND_HINT_RET_OFFSET + jmp ftrace_epilogue 1: restore_mcount_regs - - -2: - /* - * The stack layout is nondetermistic here, depending on which path was - * taken. This confuses objtool and ORC, rightfully so. For now, - * pretend the stack always looks like the non-direct case. - */ - UNWIND_HINT_RESTORE - /* Restore flags */ popfq @@ -268,7 +267,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * to the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) - jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) -- cgit From dc2745b61907cf6faeb72cc25f2cc4b38d4a3cac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 16:50:40 +0200 Subject: x86,ftrace: Use SIZEOF_PTREGS There's a convenient macro for 'SS+8' called FRAME_SIZE. Use it to clarify things. (entry/calling.h calls this SIZEOF_PTREGS but we're using asm/ptrace-abi.h) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.808485515@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7657dc782828..be9aff20dd5f 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -23,7 +23,7 @@ #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) /* * gcc -pg option adds a call to 'mcount' in most functions. @@ -77,7 +77,7 @@ /* * We add enough stack to save all regs. */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rcx, RCX(%rsp) movq %rdx, RDX(%rsp) -- cgit From 9f2dfd61dd022d4559d42a832fb03e76aad36c5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 16:51:11 +0200 Subject: x86,ftrace: Shrink ftrace_regs_caller() by one byte 'Optimize' ftrace_regs_caller. Instead of comparing against an immediate, the more natural way to test for zero on x86 is: 'test %r,%r'. 48 83 f8 00 cmp $0x0,%rax 74 49 je 226 48 85 c0 test %rax,%rax 74 49 je 225 Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.867411350@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index be9aff20dd5f..9738ed23964e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -240,8 +240,8 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * See arch_ftrace_set_direct_caller(). 
 */
 	movq ORIG_RAX(%rsp), %rax
-	cmpq	$0, %rax
-	je	1f
+	testq	%rax, %rax
+	jz	1f
-- cgit

From 9adbf3c609af92a57a73000a3cb8f4c2d307dfa3 Mon Sep 17 00:00:00 2001
From: Mihai Carabas
Date: Tue, 21 Apr 2020 22:28:38 +0300
Subject: x86/microcode: Fix return value for microcode late loading

The return value from stop_machine() might not be consistent.

stop_machine_cpuslocked() returns:
- zero if all functions have returned 0.
- a non-zero value if at least one of the functions returned a non-zero value.

There is no way to know if it is negative or positive. So make __reload_late() return 0 on success or negative otherwise.

[ bp: Unify ret val check and touch up. ]

Signed-off-by: Mihai Carabas
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/1587497318-4438-1-git-send-email-mihai.carabas@oracle.com
---
 arch/x86/kernel/cpu/microcode/core.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 7019d4b2df0c..baec68b7e010 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -545,8 +545,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout)
 /*
  * Returns:
  * < 0 - on error
- *   0 - no update done
- *   1 - microcode was updated
+ *   0 - success (no update done or microcode was updated)
  */
 static int __reload_late(void *info)
 {
@@ -573,11 +572,11 @@ static int __reload_late(void *info)
 	else
 		goto wait_for_siblings;

-	if (err > UCODE_NFOUND) {
-		pr_warn("Error reloading microcode on CPU %d\n", cpu);
+	if (err >= UCODE_NFOUND) {
+		if (err == UCODE_ERROR)
+			pr_warn("Error reloading microcode on CPU %d\n", cpu);
+
 		ret = -1;
-	} else if (err == UCODE_UPDATED || err == UCODE_OK) {
-		ret = 1;
 	}

 wait_for_siblings:
@@ -608,7 +607,7 @@ static int microcode_reload_late(void)
 	atomic_set(&late_cpus_out, 0);

 	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
-	if (ret > 0)
+	if (ret == 0)
 		microcode_check();

 	pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
@@ -649,7 +648,7 @@ static ssize_t reload_store(struct device *dev,

 put:
 	put_online_cpus();

-	if (ret >= 0)
+	if (ret == 0)
 		ret = size;

 	return ret;
-- cgit

From d8f0b35331c4423e033f81f10eb5e0c7e4e1dcec Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 21 Apr 2020 11:20:29 +0200
Subject: x86/cpu: Uninline CR4 accessors

cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules.

The various CR4 accessors require cpu_tlbstate as the CR4 shadow cache is located there.

In preparation for unexporting cpu_tlbstate, create a builtin function for manipulating CR4 and rework the various helpers to use it.

No functional change.

[ bp: push the export of native_write_cr4() only when CONFIG_LKDTM=m to the last patch in the series.
] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092558.939985695@linutronix.de --- arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++- arch/x86/kernel/process.c | 11 +++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..82042f40fc45 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -387,7 +387,28 @@ set_register: bits_missing); } } -EXPORT_SYMBOL(native_write_cr4); +EXPORT_SYMBOL_GPL(native_write_cr4); + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); void cr4_init(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..f2eab49d044e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void) preempt_enable(); } +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; -- cgit From 9020d3956317d052cdddd43e55acdd2970344192 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:31 +0200 Subject: x86/alternatives: Move temporary_mm helpers into C The only user of these inlines is the text poke code and this must not be exposed to the world. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.139069561@linutronix.de --- arch/x86/kernel/alternative.c | 55 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7867dfb3963e..cd617979b7fc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -783,6 +783,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. 
+ */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -- cgit From 2faf153bb7346b7dfc895f916edf93a86297ec0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:32 +0200 Subject: x86/tlb: Move __flush_tlb() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a first step, move __flush_tlb() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. Consolidate the namespace while at it and remove the pointless extra wrapper in the paravirt code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.246130908@linutronix.de --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/paravirt.c | 7 +------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 51b9190c628b..23ad8e953dfb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); @@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c131ba4e70ef..4cb3d822ea09 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -160,11 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len, return insn_len; } -static void native_flush_tlb(void) -{ - __native_flush_tlb(); -} - /* * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. 
 */
@@ -359,7 +354,7 @@ struct paravirt_patch_template pv_ops = {
 #endif /* CONFIG_PARAVIRT_XXL */

 	/* Mmu ops. */
-	.mmu.flush_tlb_user	= native_flush_tlb,
+	.mmu.flush_tlb_user	= native_flush_tlb_local,
 	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
 	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
 	.mmu.flush_tlb_others	= native_flush_tlb_others,
-- cgit

From cd30d26cf307b45159cd629d60b989e582372afe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 21 Apr 2020 11:20:33 +0200
Subject: x86/tlb: Move __flush_tlb_global() out of line

cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules.

As a second step, move __flush_tlb_global() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled.

Consolidate the namespace while at it and remove the pointless extra wrapper in the paravirt code.

No functional change.

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Reviewed-by: Alexandre Chartre
Acked-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200421092559.336916818@linutronix.de
---
 arch/x86/kernel/paravirt.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 4cb3d822ea09..6094b007979c 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -160,15 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
 	return insn_len;
 }

-/*
- * Global pages have to be flushed a bit differently. Not a real
- * performance problem because this does not happen often.
- */
-static void native_flush_tlb_global(void)
-{
-	__native_flush_tlb_global();
-}
-
 static void native_flush_tlb_one_user(unsigned long addr)
 {
 	__native_flush_tlb_one_user(addr);
 }
-- cgit

From 127ac915c8e1c11b8209393e700ca16be0efabe8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 21 Apr 2020 11:20:34 +0200
Subject: x86/tlb: Move __flush_tlb_one_user() out of line

cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules.

As a third step, move __flush_tlb_one_user() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled.

Consolidate the namespace while at it and remove the pointless extra wrapper in the paravirt code.

No functional change.
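The shape of the resulting code, for all three steps, is roughly the following (a simplified sketch, assuming the out-of-line bodies land next to the rest of the TLB code in arch/x86/mm/tlb.c; the real native_flush_tlb_one_user() additionally has to deal with PTI):

	/* static unless the paravirt ops table must reference it */
	#ifndef CONFIG_PARAVIRT
	static
	#endif
	void native_flush_tlb_one_user(unsigned long addr)
	{
		/* flush a single user-space mapping via INVLPG */
		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
	}

	void flush_tlb_one_user(unsigned long addr)
	{
		__flush_tlb_one_user(addr);	/* native or paravirt */
	}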
Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Reviewed-by: Alexandre Chartre
Acked-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200421092559.428213098@linutronix.de
---
 arch/x86/kernel/paravirt.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 6094b007979c..5638e4ae2ea6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -160,11 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
 	return insn_len;
 }

-static void native_flush_tlb_one_user(unsigned long addr)
-{
-	__native_flush_tlb_one_user(addr);
-}
-
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
-- cgit

From 21953ee5013d6632bee90ec89f2df59c69050db0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sun, 26 Apr 2020 18:55:15 +0200
Subject: x86/cpu: Export native_write_cr4() only when CONFIG_LKDTM=m

Modules have no business poking into this but fixing this is for later.

[ bp: Carve out from an earlier patch. ]

Signed-off-by: Thomas Gleixner
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/20200421092558.939985695@linutronix.de
---
 arch/x86/kernel/cpu/common.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 82042f40fc45..eab3ebd22927 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -387,7 +387,9 @@ set_register:
 			bits_missing);
 	}
 }
+#if IS_MODULE(CONFIG_LKDTM)
 EXPORT_SYMBOL_GPL(native_write_cr4);
+#endif

 void cr4_update_irqsoff(unsigned long set, unsigned long clear)
 {
-- cgit

From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 24 Apr 2020 08:43:38 +0200
Subject: sysctl: pass kernel pointers to ->proc_handler

Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer.

As most handlers just pass the data through to one of the common handlers, a lot of the changes are mechanical.

Signed-off-by: Christoph Hellwig
Acked-by: Andrey Ignatov
Signed-off-by: Al Viro
---
 arch/x86/kernel/itmt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 1cb3ca9bba49..1afbdd1dd777 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable;
 unsigned int __read_mostly sysctl_sched_itmt_enabled;

 static int sched_itmt_update_handler(struct ctl_table *table, int write,
-				     void __user *buffer, size_t *lenp,
-				     loff_t *ppos)
+				     void *buffer, size_t *lenp, loff_t *ppos)
 {
 	unsigned int old_sysctl;
 	int ret;
-- cgit

From 694cfd87b0c8a48af2f1afb225563571c0b975c4 Mon Sep 17 00:00:00 2001
From: "Ronald G. Minnich"
Date: Sat, 25 Apr 2020 18:10:21 -0700
Subject: x86/setup: Add an initrdmem= option to specify initrd physical address

Add the initrdmem option: initrdmem=ss[KMG],nn[KMG] which is used to specify the physical address of the initrd, almost always an address in FLASH. Also add code for x86 to use the existing phys_initrd_start and phys_initrd_size variables in the kernel.
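The diff below covers the consumer side in setup.c; the parsing side of the option (not included in this excerpt) amounts to an early_param hook along these lines (a sketch assuming the usual memparse() idiom):

	static int __init early_initrdmem(char *p)
	{
		phys_addr_t start;
		unsigned long size;

		/* initrdmem=ss[KMG],nn[KMG]: physical start, then size */
		start = memparse(p, &p);
		if (*p == ',') {
			size = memparse(p + 1, &p);

			phys_initrd_start = start;
			phys_initrd_size = size;
		}
		return 0;
	}
	early_param("initrdmem", early_initrdmem);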
This is useful in cases where a kernel and an initrd are placed in FLASH, but there is no firmware file system structure in the FLASH.

One such situation occurs when unused FLASH space on UEFI systems has been reclaimed by, e.g., taking it from the Management Engine. For example, on many systems, the ME is given half the FLASH part; not only is 2.75M of an 8M part unused, but 10.75M of a 16M part is unused as well. This space can be used to contain an initrd, but Linux must be told where it is.

This space is "raw": due to, e.g., UEFI limitations, it cannot be added to UEFI firmware volumes without rebuilding UEFI from source or writing a UEFI device driver. It can be referenced only as a physical address and size.

At the same time, if a kernel can be "netbooted" or loaded from GRUB or syslinux, the option of not using the physical address specification should be available. Then, it is easy to boot the kernel and provide an initrd; or boot the kernel and let it use the initrd in FLASH.

In practice, this has proven to be very helpful when integrating Linux into FLASH on x86. Hence, the most flexible and convenient path is to enable the initrdmem command line option such that it is the last choice tried.

For example, on the DigitalLoggers Atomic Pi, an image can be burnt into FLASH with a built-in command line which includes:

initrdmem=0xff968000,0x200000

which specifies a location and size.

[ bp: Massage commit message, make it passive. ]
[akpm@linux-foundation.org: coding style fixes]

Signed-off-by: Ronald G. Minnich
Signed-off-by: Andrew Morton
Signed-off-by: Borislav Petkov
Reviewed-by: H. Peter Anvin (Intel)
Link: http://lkml.kernel.org/r/CAP6exYLK11rhreX=6QPyDQmW7wPHsKNEFtXE47pjx41xS6O7-A@mail.gmail.com
Link: https://lkml.kernel.org/r/20200426011021.1cskg0AGd%akpm@linux-foundation.org
---
 arch/x86/kernel/setup.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4b3fa6cd3106..a3767e74c758 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -237,6 +237,9 @@ static u64 __init get_ramdisk_image(void)

 	ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;

+	if (ramdisk_image == 0)
+		ramdisk_image = phys_initrd_start;
+
 	return ramdisk_image;
 }
 static u64 __init get_ramdisk_size(void)
@@ -245,6 +248,9 @@ static u64 __init get_ramdisk_size(void)

 	ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;

+	if (ramdisk_size == 0)
+		ramdisk_size = phys_initrd_size;
+
 	return ramdisk_size;
 }
-- cgit

From 767dea211cd0c68d8116d8c3b5104e82454fb44b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 28 Apr 2020 07:17:03 +0200
Subject: x86/tboot: Mark tboot static

This structure is only really used in tboot.c. The only exception is a single tboot_enabled check, but for that we don't need an inline function.

Signed-off-by: Christoph Hellwig
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/20200428051703.1625952-1-hch@lst.de
---
 arch/x86/kernel/tboot.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index b89f6ac6a0c0..b2942b2dbfcf 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -35,8 +35,7 @@
 #include "../realmode/rm/wakeup.h"

 /* Global pointer to shared data; NULL means no measured launch.
*/ -struct tboot *tboot __read_mostly; -EXPORT_SYMBOL(tboot); +static struct tboot *tboot __read_mostly; /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -46,6 +45,11 @@ EXPORT_SYMBOL(tboot); static u8 tboot_uuid[16] __initdata = TBOOT_UUID; +bool tboot_enabled(void) +{ + return tboot != NULL; +} + void __init tboot_probe(void) { /* Look for valid page-aligned address for shared page. */ -- cgit From 34fdce6981b96920ced4e0ee56e9db3fb03a33f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 17:16:40 +0200 Subject: x86: Change {JMP,CALL}_NOSPEC argument In order to change the {JMP,CALL}_NOSPEC macros to call out-of-line versions of the retpoline magic, we need to remove the '%' from the argument, such that we can paste it onto symbol names. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200428191700.151623523@infradead.org --- arch/x86/kernel/ftrace_32.S | 2 +- arch/x86/kernel/ftrace_64.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..e405fe1a8bf4 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 9738ed23964e..aa5d28aeb31e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -301,7 +301,7 @@ trace: * function tracing is enabled. */ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs jmp fgraph_trace @@ -338,6 +338,6 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - JMP_NOSPEC %rdi + JMP_NOSPEC rdi SYM_CODE_END(return_to_handler) #endif -- cgit From 3a4ac121c2cacbf97d493fa3bc42ead88657abe4 Mon Sep 17 00:00:00 2001 From: CodyYao-oc Date: Mon, 13 Apr 2020 11:14:29 +0800 Subject: x86/perf: Add hardware performance events support for Zhaoxin CPU. Zhaoxin CPU has provided facilities for monitoring performance via PMU (Performance Monitor Unit), but the functionality is unused so far. Therefore, add support for zhaoxin pmu to make performance related hardware events available. The PMU is mostly an Intel Architectural PerfMon-v2 with a novel errata for the ZXC line. It supports the following events: ----------------------------------------------------------------------------------------------------------------------------------- Event | Event | Umask | Description | Select | | ----------------------------------------------------------------------------------------------------------------------------------- cpu-cycles | 82h | 00h | unhalt core clock instructions | 00h | 00h | number of instructions at retirement. cache-references | 15h | 05h | number of fillq pushs at the current cycle. cache-misses | 1ah | 05h | number of l2 miss pushed by fillq. branch-instructions | 28h | 00h | counts the number of branch instructions retired. branch-misses | 29h | 00h | mispredicted branch instructions at retirement. bus-cycles | 83h | 00h | unhalt bus clock stalled-cycles-frontend | 01h | 01h | Increments each cycle the # of Uops issued by the RAT to RS. stalled-cycles-backend | 0fh | 04h | RS0/1/2/3/45 empty L1-dcache-loads | 68h | 05h | number of retire/commit load. 
L1-dcache-load-misses | 4bh | 05h | retired load uops whose data source followed an L1 miss. L1-dcache-stores | 69h | 06h | number of retire/commit Store,no LEA L1-dcache-store-misses | 62h | 05h | cache lines in M state evicted out of L1D due to Snoop HitM or dirty line replacement. L1-icache-loads | 00h | 03h | number of l1i cache access for valid normal fetch,including un-cacheable access. L1-icache-load-misses | 01h | 03h | number of l1i cache miss for valid normal fetch,including un-cacheable miss. L1-icache-prefetches | 0ah | 03h | number of prefetch. L1-icache-prefetch-misses | 0bh | 03h | number of prefetch miss. dTLB-loads | 68h | 05h | number of retire/commit load dTLB-load-misses | 2ch | 05h | number of load operations miss all level tlbs and cause a tablewalk. dTLB-stores | 69h | 06h | number of retire/commit Store,no LEA dTLB-store-misses | 30h | 05h | number of store operations miss all level tlbs and cause a tablewalk. dTLB-prefetches | 64h | 05h | number of hardware pte prefetch requests dispatched out of the prefetch FIFO. dTLB-prefetch-misses | 65h | 05h | number of hardware pte prefetch requests miss the l1d data cache. iTLB-load | 00h | 00h | actually counter instructions. iTLB-load-misses | 34h | 05h | number of code operations miss all level tlbs and cause a tablewalk. ----------------------------------------------------------------------------------------------------------------------------------- Reported-by: kbuild test robot Signed-off-by: CodyYao-oc Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1586747669-4827-1-git-send-email-CodyYao-oc@zhaoxin.com --- arch/x86/kernel/cpu/perfctr-watchdog.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..a5ee607a3b89 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; -- cgit From 3b4ff4eb904fef04c36b39052ca8eb31fa41fad0 Mon Sep 17 00:00:00 2001 From: He Zhe Date: Wed, 4 Mar 2020 14:39:07 +0800 Subject: x86/mcelog: Add compat_ioctl for 32-bit mcelog support A 32-bit version of mcelog issuing ioctls on /dev/mcelog causes errors like the following: MCE_GET_RECORD_LEN: Inappropriate ioctl for device This is due to a missing compat_ioctl callback. Assign to it compat_ptr_ioctl() as a generic implementation of the .compat_ioctl file operation to ioctl functions that either ignore the argument or pass a pointer to a compatible data type. [ bp: Massage commit message. 
] Signed-off-by: He Zhe Signed-off-by: Borislav Petkov Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1583303947-49858-1-git-send-email-zhe.he@windriver.com --- arch/x86/kernel/cpu/mce/dev-mcelog.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index c033e7ea9e3c..a4fd5287f02f 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -329,6 +329,7 @@ static const struct file_operations mce_chrdev_ops = { .write = mce_chrdev_write, .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, + .compat_ioctl = compat_ptr_ioctl, .llseek = no_llseek, }; -- cgit From c3b3f52476412a3899f2c65b220075aceb18dd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 12:12:53 +0200 Subject: signal: refactor copy_siginfo_to_user32 Factor out a copy_siginfo_to_external32 helper from copy_siginfo_to_user32 that fills out the compat_siginfo, but does so on a kernel space data structure. With that we can let architectures override copy_siginfo_to_user32 with their own implementations using copy_siginfo_to_external32. That allows moving the x32 SIGCHLD purely to x86 architecture code. As a nice side effect copy_siginfo_to_external32 also comes in handy for avoiding a set_fs() call in the coredump code later on. Contains improvements from Eric W. Biederman and Arnd Bergmann . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/x86/kernel/signal.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..f3df262e370b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,6 +37,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #endif /* CONFIG_X86_64 */ @@ -511,6 +512,31 @@ Efault: } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } -- cgit From 8dd97c65185c5a63c668e5bd8a861c04f47a35ed Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:12 -0700 Subject: x86/resctrl: Rename asm/resctrl_sched.h to asm/resctrl.h asm/resctrl_sched.h is dedicated to the code used for configuration of the CPU resource control state when a task is scheduled. Rename resctrl_sched.h to resctrl.h in preparation of additions that will no longer make this file dedicated to work done during scheduling. No functional change. 
Suggested-by: Borislav Petkov
Signed-off-by: Reinette Chatre
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/6914e0ef880b539a82a6d889f9423496d471ad1d.1588715690.git.reinette.chatre@intel.com
---
 arch/x86/kernel/cpu/resctrl/core.c        | 2 +-
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c    | 2 +-
 arch/x86/kernel/process_32.c              | 2 +-
 arch/x86/kernel/process_64.c              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index d8cc5223b7ce..6f38c88226af 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -22,7 +22,7 @@
 #include
 #include
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
 #include "internal.h"

 /* Mutex to protect rdtgroup access. */
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index d7623e1b927d..4bd28b388a1a 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -24,7 +24,7 @@
 #include
 #include
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
 #include
 #include "../../events/perf_event.h" /* For X86_CONFIG() */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 5a359d9fcc05..6276ae015945 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -29,7 +29,7 @@
 #include

-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
 #include "internal.h"

 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 954b013cc585..538d4e8d6589 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -52,7 +52,7 @@
 #include
 #include
 #include
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
 #include
 #include "process.h"
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5ef9d8f25b0e..0c169a5687e1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
 #include
 #include
 #include
-#include <asm/resctrl_sched.h>
+#include <asm/resctrl.h>
 #include
 #include
 #ifdef CONFIG_IA32_EMULATION
-- cgit

From 0118ad82c2a64ebcf15d7565ed35361407efadfa Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 5 May 2020 15:36:13 -0700
Subject: x86/cpu: Move resctrl CPUID code to resctrl/

The function determining a platform's support and properties of cache occupancy and memory bandwidth monitoring (properties of X86_FEATURE_CQM_LLC) can be found among the common CPU code. After the feature's properties are populated in the per-CPU data, the resctrl subsystem is the only consumer (via boot_cpu_data).

Move the function that obtains the CPU information used by resctrl to the resctrl subsystem and rename it from init_cqm() to resctrl_cpu_detect(). The function continues to be called from the common CPU code. This move is done in preparation for the addition of some vendor-specific code.

No functional change.
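The enumeration itself is unchanged by the move and is easy to reproduce from user space for comparison (an illustrative standalone sketch using GCC's cpuid.h, not kernel code; register meanings as in resctrl_cpu_detect()):

	#include <cpuid.h>
	#include <stdio.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* QoS monitoring sub-leaf: EAX=0Fh, ECX=1 */
		__cpuid_count(0xf, 1, eax, ebx, ecx, edx);

		printf("max RMID: %u, occupancy scale: %u bytes\n",
		       ecx, ebx);
		return 0;
	}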
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/38433b99f9d16c8f4ee796f8cc42b871531fa203.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/common.c | 27 ++------------------------- arch/x86/kernel/cpu/resctrl/core.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..a8f0f22ee5c1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "cpu.h" @@ -854,30 +855,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -945,7 +922,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); + resctrl_cpu_detect(c); /* * Clear/Set all flags overridden by options, after probe. diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6f38c88226af..861c6d1ba9ab 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -958,6 +958,30 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; -- cgit From f0d339db56478e3bcd98d5e985d3d69cacf27549 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:14 -0700 Subject: x86/resctrl: Remove unnecessary RMID checks The cache and memory bandwidth monitoring properties are read using CPUID on every CPU. After the information is read from the system a sanity check is run to (1) ensure that the RMID data is initialized for the boot CPU in case the information was not available on the boot CPU and (2) the boot CPU's RMID is set to the minimum of RMID obtained from all CPUs. Every known platform that supports resctrl has the same maximum RMID on all CPUs. Both sanity checks found in x86_init_cache_qos() can thus safely be removed. 
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/c9a3b60d34091840c8b0bd1c6fab15e5ba92cb17.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/common.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a8f0f22ee5c1..556a96d05a6c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1354,20 +1354,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. @@ -1480,7 +1466,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* -- cgit From 923f3a2b48bdccb6a1d1f0dd48de03de7ad936d9 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:15 -0700 Subject: x86/resctrl: Query LLC monitoring properties once during boot Cache and memory bandwidth monitoring are features that are part of x86 CPU resource control that is supported by the resctrl subsystem. The monitoring properties are obtained via CPUID from every CPU and only used within the resctrl subsystem where the properties are only read from boot_cpu_data. Obtain the monitoring properties once, placed in boot_cpu_data, via the ->c_bsp_init() helpers of the vendors that support X86_FEATURE_CQM_LLC. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/6d74a6ac3e69f4b7a8b4115835f9455faf0f468d.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/amd.c | 3 +++ arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/intel.c | 7 +++++++ arch/x86/kernel/cpu/resctrl/core.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 547ad7bbf0e0..c36e89930965 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -597,6 +598,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 556a96d05a6c..d07809286b95 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -56,7 +56,6 @@ #include #include #include -#include #include "cpu.h" @@ -922,7 +921,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - resctrl_cpu_detect(c); /* * Clear/Set all flags overridden by options, after probe. 
 */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a19a680542ce..166d7c355896 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <asm/resctrl.h>

 #ifdef CONFIG_X86_64
 #include
@@ -322,6 +323,11 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	detect_ht_early(c);
 }

+static void bsp_init_intel(struct cpuinfo_x86 *c)
+{
+	resctrl_cpu_detect(c);
+}
+
 #ifdef CONFIG_X86_32
 /*
  * Early probe support logic for ppro memory erratum #50
@@ -961,6 +967,7 @@ static const struct cpu_dev intel_cpu_dev = {
 #endif
 	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init	= early_init_intel,
+	.c_bsp_init	= bsp_init_intel,
 	.c_init		= init_intel,
 	.c_x86_vendor	= X86_VENDOR_INTEL,
 };
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 861c6d1ba9ab..d5979073301e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -958,6 +958,7 @@ static __init void rdt_init_res_defs(void)

 static enum cpuhp_state rdt_online;

+/* Runs once on the BSP during boot. */
 void resctrl_cpu_detect(struct cpuinfo_x86 *c)
 {
 	if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
-- cgit

From 46637d4570e108d1f6721cfa2cca1d078882761a Mon Sep 17 00:00:00 2001
From: Reinette Chatre
Date: Tue, 5 May 2020 15:36:16 -0700
Subject: x86/resctrl: Maintain MBM counter width per resource

The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR, and the first-generation MBM implementation uses 24 bit counters. Software is required to poll at 1 second or faster to ensure that data is retrieved before a counter rollover occurs more than once under worst conditions.

As system bandwidths scale, the software requirement is maintained with the introduction of a per-resource enumerable MBM counter width.

In preparation for supporting hardware with an enumerable MBM counter width, the current globally static MBM counter width is moved to a per-resource MBM counter width. It is currently always initialized to 24, resulting in no functional change.

In essence there is one function, mbm_overflow_count(), that needs to know the counter width to handle rollovers. The static value used within mbm_overflow_count() will be replaced with a value discovered from the hardware. Support for learning the MBM counter width from hardware is added in the change that follows.
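For reference, the rollover handling that makes the width matter boils down to the shift trick used by mbm_overflow_count() in the diff below (a standalone restatement of the same math):

	#include <stdint.h>

	/* rollover-safe delta of a 'width'-bit hardware counter */
	static inline uint64_t mbm_delta(uint64_t prev, uint64_t cur,
					 unsigned int width)
	{
		uint64_t shift = 64 - width;

		/* drop the bits above 'width', then shift back down */
		return ((cur << shift) - (prev << shift)) >> shift;
	}

	/* e.g. width == 24: prev == 0xffffff, cur == 0x1 yields 2 */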
Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/e36743b9800f16ce600f86b89127391f61261f23.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 8 +++++--- arch/x86/kernel/cpu/resctrl/internal.h | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 21 +++++++++++++-------- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- 4 files changed, 24 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..934c8fb8a64a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. */ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3dd13f3a8b23..58b002c31655 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -87,6 +87,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -460,6 +461,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -587,8 +589,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 773124b0e18a..df964c03f6c6 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -214,9 +214,9 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; @@ -256,7 +256,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -278,7 +278,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) 
return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); m->chunks_bw += chunks; m->chunks = m->chunks_bw; cur_bw = (chunks * r->mon_scale) >> 20; @@ -433,11 +433,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -510,6 +511,7 @@ void mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -517,16 +519,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -619,6 +623,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r) r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH; /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6276ae015945..d7cb5ab0d1f0 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2472,7 +2472,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; -- cgit From f3d44f18b0662327c42128b9d3604489bdb6e36f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:17 -0700 Subject: x86/resctrl: Support CPUID enumeration of MBM counter width The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. Expand the MBM CPUID enumeration properties to include the MBM counter width. The previously undefined EAX output register contains, in bits [7:0], the MBM counter width encoded as an offset from 24 bits. Enumerating this property is only specified for Intel CPUs. 
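Decoded, the effective counter width for such CPUs follows as (a sketch; MBM_CNTR_WIDTH_BASE is the name the next patch in this series gives to the architectural 24-bit base):

	/* CPUID.(EAX=0FH, ECX=1H):EAX[7:0] = offset from 24 bits */
	offset = eax & 0xff;
	width  = MBM_CNTR_WIDTH_BASE + offset;	/* i.e. 24 + offset */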
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/afa3af2f753f6bc301fb743bc8944e749cb24afa.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d5979073301e..12f967c6b603 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -964,6 +964,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; return; } @@ -980,6 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; } } -- cgit From 0c4d5ba1b998e713815b7790d3db6ced0ae49489 Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:18 -0700 Subject: x86/resctrl: Support wider MBM counters The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. The MBM CPUID enumeration properties have been expanded to include the MBM counter width, encoded as an offset from 24 bits. While eight bits are available for the counter width offset, the IA32_QM_CTR MSR only supports 62 bit counters. Add a sanity check that warns about and ignores an impossible counter width offset, ensuring counters cannot exceed the 62 bit limit. Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/69d52abd5b14794d3a0f05ba7c755ed1f4c0d5ed.1588715690.git.reinette.chatre@intel.com --- arch/x86/kernel/cpu/resctrl/internal.h | 8 +++++++- arch/x86/kernel/cpu/resctrl/monitor.c | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 58b002c31655..f20a47d120b1 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -40,6 +40,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE.
+ */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index df964c03f6c6..837d7d012b7b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -618,12 +618,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; - r->mbm_width = MBM_CNTR_WIDTH; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number -- cgit From 66abf2388331b800f290e854cca3ae71de7977fe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 May 2020 19:27:16 +0200 Subject: x86/apic: Convert the TSC deadline timer matching to steppings macro ... and get rid of the function pointers which would spit out the microcode revision based on the CPU stepping. Signed-off-by: Borislav Petkov Reviewed-by: Mark Gross Cc: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200506071516.25445-4-bp@alien8.de --- arch/x86/kernel/apic/apic.c | 57 ++++++++++----------------------------------- 1 file changed, 12 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e53dda210cd7..4b1d31be50b4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -544,46 +544,20 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -static __init u32 hsx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x3a; /* EP */ - case 0x04: return 0x0f; /* EX */ - } - - return ~0U; -} - -static __init u32 bdx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x02: return 0x00000011; - case 0x03: return 0x0700000e; - case 0x04: return 0x0f00000c; - case 0x05: return 0x0e000003; - } - - return ~0U; -} - -static __init u32 skx_deadline_rev(void) -{ - switch (boot_cpu_data.x86_stepping) { - case 0x03: return 0x01000136; - case 0x04: return 0x02000014; - } +static const struct x86_cpu_id deadline_match[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ - if (boot_cpu_data.x86_stepping > 4) - return 0; + X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - return ~0U; -} + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), -static const struct x86_cpu_id deadline_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev), - X86_MATCH_INTEL_FAM6_MODEL( 
SKYLAKE_X, &skx_deadline_rev), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), + X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), @@ -615,14 +589,7 @@ static __init bool apic_validate_deadline_timer(void) if (!m) return true; - /* - * Function pointers will have the MSB set due to address layout, - * immediate revisions will not. - */ - if ((long)m->driver_data < 0) - rev = ((u32 (*)(void))(m->driver_data))(); - else - rev = (u32)m->driver_data; + rev = (u32)m->driver_data; if (boot_cpu_data.microcode >= rev) return true; -- cgit From 565558558985b1d7cd43b21f18c1ad6b232788d0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 30 Apr 2020 12:40:03 +0100 Subject: cpu/hotplug: Remove disable_nonboot_cpus() The single user could have called freeze_secondary_cpus() directly. Since this function was a source of confusion, remove it as it's just a pointless wrapper. While at it, rename enable_nonboot_cpus() to thaw_secondary_cpus() to preserve the naming symmetry. Done automatically via: git grep -l enable_nonboot_cpus | xargs sed -i 's/enable_nonboot_cpus/thaw_secondary_cpus/g' Signed-off-by: Qais Yousef Signed-off-by: Thomas Gleixner Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20200430114004.17477-1-qais.yousef@arm.com --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe3ab9632f3b..997b66c18154 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1376,12 +1376,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) speculative_store_bypass_ht_init(); } -void arch_enable_nonboot_cpus_begin(void) +void arch_thaw_secondary_cpus_begin(void) { set_mtrr_aps_delayed_init(); } -void arch_enable_nonboot_cpus_end(void) +void arch_thaw_secondary_cpus_end(void) { mtrr_aps_init(); } -- cgit From e4dd8b8351264fb5bef30c0a77e4d171e0603d63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:22 +0200 Subject: x86/platform/uv: Mark is_uv_hubless() static is_uv_hubless() is only used in x2apic_uv_x.c. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-7-hch@lst.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad53b2abc859..cb07a98771f9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -385,11 +385,10 @@ int is_uv_hubbed(int uvtype) } EXPORT_SYMBOL_GPL(is_uv_hubbed); -int is_uv_hubless(int uvtype) +static int is_uv_hubless(int uvtype) { return (uv_hubless_system & uvtype); } -EXPORT_SYMBOL_GPL(is_uv_hubless); void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); -- cgit From 8263b059379c4a95fe0181db1e5a2e9c2229d929 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:23 +0200 Subject: x86/platform/uv: Mark uv_min_hub_revision_id static This variable is only used inside x2apic_uv_x and not even declared in a header. 
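[ An editorial aside on the deadline_match[] conversion earlier in this series, for readers unfamiliar with the x86_cpu_id pattern: the following stand-alone sketch mimics a stepping-range match table whose driver_data carries a minimum microcode revision. The struct and helper are hypothetical simplifications; the kernel's x86_match_cpu() machinery also matches on vendor, family and feature flags. The sample values mirror the SKYLAKE_X (family 6, model 0x55) entries shown in the diff above. ]

#include <stdio.h>
#include <stdint.h>

struct cpu_rev_match {
	uint8_t model;			/* 0 terminates the table */
	uint8_t step_min, step_max;	/* inclusive stepping range */
	uint32_t min_microcode;		/* plays the role of driver_data */
};

static const struct cpu_rev_match deadline_match[] = {
	{ 0x55, 0x3, 0x3, 0x01000136 },
	{ 0x55, 0x4, 0x4, 0x02000014 },
	{ 0x55, 0x5, 0xf, 0x00000000 },
	{ 0 }
};

static const struct cpu_rev_match *match(uint8_t model, uint8_t stepping)
{
	const struct cpu_rev_match *m;

	/* First matching (model, stepping-range) entry wins */
	for (m = deadline_match; m->model; m++)
		if (m->model == model &&
		    stepping >= m->step_min && stepping <= m->step_max)
			return m;
	return NULL;
}

int main(void)
{
	const struct cpu_rev_match *m = match(0x55, 0x4);

	if (m)
		printf("TSC deadline timer needs microcode >= 0x%08x\n",
		       m->min_microcode);
	return 0;
}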
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-8-hch@lst.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index cb07a98771f9..f1a0142e2731 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -48,8 +48,7 @@ static struct { unsigned int gnode_shift; } uv_cpuid; -int uv_min_hub_revision_id; -EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static int uv_min_hub_revision_id; unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); -- cgit From 8e77554580250f1185cffd8d3ac6a9b01de05d60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:24 +0200 Subject: x86/platform/uv: Simplify uv_send_IPI_one() Merge two helpers only used by uv_send_IPI_one() into the main function. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-9-hch@lst.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f1a0142e2731..3830538095e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -588,12 +588,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void uv_send_IPI_one(int cpu, int vector) { - unsigned long apicid; - int pnode; + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; + else + dmode = dest_Fixed; - apicid = per_cpu(x86_cpu_to_apicid, cpu); - pnode = uv_apicid_to_pnode(apicid); - uv_hub_send_ipi(pnode, apicid, vector); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(const struct cpumask *mask, int vector) -- cgit From fbe1d37866d249b6936cf526592957725a941f95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:25 +0200 Subject: x86/platform/uv: Remove _uv_hub_info_check() Neither this function nor the helpers used to implement it are used anywhere in the kernel tree.
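[ Returning briefly to the uv_send_IPI_one() rewrite above: the merged helper composes a single 64-bit UVH_IPI_INT MMR value from the target APIC ID, the delivery mode and the vector. Schematically, as an editorial sketch: the shift values below are assumptions standing in for the real UVH_IPI_INT_*_SHFT constants in uv_mmrs.h and should be verified against that header. ]

#include <stdint.h>

#define SEND_SHFT		63	/* assumed UVH_IPI_INT_SEND_SHFT */
#define APIC_ID_SHFT		16	/* assumed ..._APIC_ID_SHFT */
#define DELIVERY_MODE_SHFT	8	/* assumed ..._DELIVERY_MODE_SHFT */
#define VECTOR_SHFT		0	/* assumed ..._VECTOR_SHFT */

static uint64_t ipi_int_val(uint64_t apicid, uint64_t dmode, uint64_t vector)
{
	return (UINT64_C(1) << SEND_SHFT) |	/* trigger the send */
	       (apicid << APIC_ID_SHFT) |
	       (dmode << DELIVERY_MODE_SHFT) |
	       (vector << VECTOR_SHFT);
}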
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-10-hch@lst.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3830538095e6..8cf0e24cf883 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -415,12 +415,6 @@ static __initdata struct uv_gam_range_s *_gr_table; #define SOCK_EMPTY ((unsigned short)~0) -extern int uv_hub_info_version(void) -{ - return UV_HUB_INFO_VERSION; -} -EXPORT_SYMBOL(uv_hub_info_version); - /* Default UV memory block size is 2GB */ static unsigned long mem_block_size __initdata = (2UL << 30); -- cgit From 479d6d904557b774c1bd5b0338d8111dd2b322ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:26 +0200 Subject: x86/platform/uv: Unexport uv_apicid_hibits This variable is not used by modular code. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200504171527.2845224-11-hch@lst.de --- arch/x86/kernel/apic/x2apic_uv_x.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8cf0e24cf883..10339ad02033 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -51,7 +51,6 @@ static struct { static int uv_min_hub_revision_id; unsigned int uv_apicid_hibits; -EXPORT_SYMBOL_GPL(uv_apicid_hibits); static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; -- cgit From cec5f268cd02d25d2d74807843d8ae0292fe0fb7 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Fri, 24 Apr 2020 12:37:56 -0700 Subject: x86/delay: Introduce TPAUSE delay TPAUSE instructs the processor to enter an implementation-dependent optimized state. The instruction execution wakes up when the time-stamp counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The instruction execution also wakes up due to the expiration of the operating system time-limit or by an external interrupt or exceptions such as a debug exception or a machine check exception. TPAUSE offers a choice of two lower power states: 1. Light-weight power/performance optimized state C0.1 2. Improved power/performance optimized state C0.2 This way, it can save power with low wake-up latency in comparison to spinloop based delay. The selection between the two is governed by the input register. TPAUSE is available on processors with X86_FEATURE_WAITPKG. 
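[ To make the mechanism concrete, a minimal user-space sketch of a TPAUSE-based delay, editorial and not the kernel's implementation: it assumes a compiler exposing _tpause() and __rdtsc() via x86intrin.h, a -mwaitpkg build, and hardware with X86_FEATURE_WAITPKG. The hardware may wake the thread before the deadline (interrupt, OS time limit), hence the surrounding loop. ]

#include <x86intrin.h>

#define TPAUSE_C02	0	/* ECX bit 0 clear: allow deeper C0.2 */
#define TPAUSE_C01	1	/* ECX bit 0 set: lighter C0.1 only */

/* Spin until the TSC reaches 'deadline', napping in C0.2 meanwhile. */
static void tpause_delay_until(unsigned long long deadline)
{
	while (__rdtsc() < deadline)
		_tpause(TPAUSE_C02, deadline);
}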
Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1587757076-30337-4-git-send-email-kyung.min.park@intel.com --- arch/x86/kernel/time.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 106e7f87f534..371a6b348e44 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -103,6 +103,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); } /* -- cgit From e2abfc0448a46d8a137505aa180caf14070ec535 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Fri, 17 Apr 2020 09:33:56 -0500 Subject: x86/cpu/amd: Make erratum #1054 a legacy erratum Commit 21b5ee59ef18 ("x86/cpu/amd: Enable the fixed Instructions Retired counter IRPERF") mistakenly added erratum #1054 as an OS Visible Workaround (OSVW) ID 0. Erratum #1054 is not OSVW ID 0 [1], so make it a legacy erratum. There would never have been a false positive on older hardware that has OSVW bit 0 set, since the IRPERF feature was not available. However, save a couple of RDMSR executions per thread, on modern system configurations that correctly set non-zero values in their OSVW_ID_Length MSRs. [1] Revision Guide for AMD Family 17h Models 00h-0Fh Processors. The revision guide is available from the bugzilla link below. Fixes: 21b5ee59ef18 ("x86/cpu/amd: Enable the fixed Instructions Retired counter IRPERF") Reported-by: Andrew Cooper Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200417143356.26054-1-kim.phillips@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 --- arch/x86/kernel/cpu/amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 547ad7bbf0e0..8a1bdda895a4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1142,8 +1142,7 @@ static const int amd_erratum_383[] = /* #1054: Instructions Retired Performance Counter May Be Inaccurate */ static const int amd_erratum_1054[] = - AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - + AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { -- cgit From 1d05334d2899bd3ecdf01beb53f0a70884a7f471 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 10:24:45 -0500 Subject: livepatch: Remove .klp.arch After the previous patch, vmlinux-specific KLP relocations are now applied early during KLP module load. This means that .klp.arch sections are no longer needed for *vmlinux-specific* KLP relocations. One might think they're still needed for *module-specific* KLP relocations. If a to-be-patched module is loaded *after* its corresponding KLP module is loaded, any corresponding KLP relocations will be delayed until the to-be-patched module is loaded. If any special sections (.parainstructions, for example) rely on those relocations, their initializations (apply_paravirt) need to be done afterwards. Thus the apparent need for arch_klp_init_object_loaded() and its corresponding .klp.arch sections -- it allows some of the special section initializations to be done at a later time. But... 
if you look closer, that dependency between the special sections and the module-specific KLP relocations doesn't actually exist in reality. Looking at the contents of the .altinstructions and .parainstructions sections, there's not a realistic scenario in which a KLP module's .altinstructions or .parainstructions section needs to access a symbol in a to-be-patched module. It might need to access a local symbol or even a vmlinux symbol; but not another module's symbol. When a special section needs to reference a local or vmlinux symbol, a normal rela can be used instead of a KLP rela. Since the special section initializations don't actually have any real dependency on module-specific KLP relocations, .klp.arch and arch_klp_init_object_loaded() no longer have a reason to exist. So remove them. As Peter said much more succinctly: So the reason for .klp.arch was that .klp.rela.* stuff would overwrite paravirt instructions. If that happens you're doing it wrong. Those RELAs are core kernel, not module, and thus should've happened in .rela.* sections at patch-module loading time. Reverting this removes the two apply_{paravirt,alternatives}() calls from the late patching path, and means we don't have to worry about them when removing module_disable_ro(). [ jpoimboe: Rewrote patch description. Tweaked klp_init_object_loaded() error path. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/livepatch.c | 53 --------------------------------------------- 2 files changed, 54 deletions(-) delete mode 100644 arch/x86/kernel/livepatch.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..bae9f9033734 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -90,7 +90,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c deleted file mode 100644 index 6a68e41206e7..000000000000 --- a/arch/x86/kernel/livepatch.c +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * livepatch.c - x86-specific Kernel Live Patching Core - */ - -#include -#include -#include -#include - -/* Apply per-object alternatives. Based on x86 module_finalize() */ -void arch_klp_init_object_loaded(struct klp_patch *patch, - struct klp_object *obj) -{ - int cnt; - struct klp_modinfo *info; - Elf_Shdr *s, *alt = NULL, *para = NULL; - void *aseg, *pseg; - const char *objname; - char sec_objname[MODULE_NAME_LEN]; - char secname[KSYM_NAME_LEN]; - - info = patch->mod->klp_info; - objname = obj->name ? 
obj->name : "vmlinux"; - - /* See livepatch core code for BUILD_BUG_ON() explanation */ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); - - for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) { - /* Apply per-object .klp.arch sections */ - cnt = sscanf(info->secstrings + s->sh_name, - ".klp.arch.%55[^.].%127s", - sec_objname, secname); - if (cnt != 2) - continue; - if (strcmp(sec_objname, objname)) - continue; - if (!strcmp(".altinstructions", secname)) - alt = s; - if (!strcmp(".parainstructions", secname)) - para = s; - } - - if (alt) { - aseg = (void *) alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - - if (para) { - pseg = (void *) para->sh_addr; - apply_paravirt(pseg, pseg + para->sh_size); - } -} -- cgit From 88fc078a7a8f67e47020d73d8d14ed11f03754ab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 10:24:49 -0500 Subject: x86/module: Use text_poke() for late relocations Because of late module patching, a livepatch module needs to be able to apply some of its relocations well after it has been loaded. Instead of playing games with module_{dis,en}able_ro(), use existing text poking mechanisms to apply relocations after module loading. So far only x86, s390 and Power have HAVE_LIVEPATCH but only the first two also have STRICT_MODULE_RWX. This will allow removal of the last module_disable_ro() usage in livepatch. The ultimate goal is to completely disallow making executable mappings writable. [ jpoimboe: Split up patches. Use mod state to determine whether memcpy() can be used. Implement text_poke() for UML. ] Cc: x86@kernel.org Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra (Intel) Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- arch/x86/kernel/module.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d5c72cb877b3..7614f478fd7a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -126,11 +126,12 @@ int apply_relocate(Elf32_Shdr *sechdrs, return 0; } #else /*X86_64*/ -int apply_relocate_add(Elf64_Shdr *sechdrs, +static int __apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + void *(*write)(void *dest, const void *src, size_t len)) { unsigned int i; Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; @@ -162,19 +163,19 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, case R_X86_64_64: if (*(u64 *)loc != 0) goto invalid_relocation; - *(u64 *)loc = val; + write(loc, &val, 8); break; case R_X86_64_32: if (*(u32 *)loc != 0) goto invalid_relocation; - *(u32 *)loc = val; + write(loc, &val, 4); if (val != *(u32 *)loc) goto overflow; break; case R_X86_64_32S: if (*(s32 *)loc != 0) goto invalid_relocation; - *(s32 *)loc = val; + write(loc, &val, 4); if ((s64)val != *(s32 *)loc) goto overflow; break; @@ -183,7 +184,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u32 *)loc != 0) goto invalid_relocation; val -= (u64)loc; - *(u32 *)loc = val; + write(loc, &val, 4); #if 0 if ((s64)val != *(s32 *)loc) goto overflow; @@ -193,7 +194,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, if (*(u64 *)loc != 0) goto invalid_relocation; val -= (u64)loc; - *(u64 *)loc = val; + write(loc, &val, 8); break; default: pr_err("%s: Unknown rela relocation: %llu\n", @@ -215,6 +216,29 @@ 
overflow: me->name); return -ENOEXEC; } + +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + int ret; + bool early = me->state == MODULE_STATE_UNFORMED; + void *(*write)(void *, const void *, size_t) = memcpy; + + if (!early) + write = text_poke; + + ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, + write); + + if (!early) + text_poke_sync(); + + return ret; +} + #endif int module_finalize(const Elf_Ehdr *hdr, -- cgit From 5b384f933590a086ca9a0abdc2e55e41107ac440 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 29 Apr 2020 10:24:52 -0500 Subject: x86/module: Use text_mutex in apply_relocate_add() Now that the livepatch code no longer needs the text_mutex for changing module permissions, move its usage down to apply_relocate_add(). Note the s390 version of apply_relocate_add() doesn't need to use the text_mutex because it already uses s390_kernel_write_lock, which accomplishes the same task. Signed-off-by: Josh Poimboeuf Acked-by: Joe Lawrence Acked-by: Miroslav Benes Signed-off-by: Jiri Kosina --- arch/x86/kernel/module.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 7614f478fd7a..23c95a53d20e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -227,14 +228,18 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, bool early = me->state == MODULE_STATE_UNFORMED; void *(*write)(void *, const void *, size_t) = memcpy; - if (!early) + if (!early) { write = text_poke; + mutex_lock(&text_mutex); + } ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me, write); - if (!early) + if (!early) { text_poke_sync(); + mutex_unlock(&text_mutex); + } return ret; } -- cgit From 5274e6c172c47241534e970df26a522497086624 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:35 -0700 Subject: x86/fpu/xstate: Rename validate_xstate_header() to validate_user_xstate_header() The function validate_xstate_header() validates an xstate header coming from userspace (PTRACE or sigreturn). To make it clear, rename it to validate_user_xstate_header(). 
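[ For context, the renamed helper's checks amount to the following stand-alone paraphrase, an editorial sketch rather than the kernel function verbatim; the header layout follows the SDM's XSAVE header, and the reserved-bytes check is an additional check the kernel's version also performs. ]

#include <stdint.h>
#include <errno.h>

struct xstate_header {
	uint64_t xfeatures;
	uint64_t xcomp_bv;
	uint64_t reserved[6];
};

static int check_user_xstate_header(const struct xstate_header *hdr,
				    uint64_t user_feature_mask)
{
	int i;

	/* No unknown or supervisor features may be set */
	if (hdr->xfeatures & ~user_feature_mask)
		return -EINVAL;

	/* Userspace must use the uncompacted format */
	if (hdr->xcomp_bv)
		return -EINVAL;

	/* Reserved bytes must be zero */
	for (i = 0; i < 6; i++)
		if (hdr->reserved[i])
			return -EINVAL;

	return 0;
}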
Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-2-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb..bd1d0649f8ce 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); if (!ret) - ret = validate_xstate_header(&xsave->header); + ret = validate_user_xstate_header(&xsave->header); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c5..585e3651b98f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -366,7 +366,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 32b153d38748..8ed64397c78b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -472,7 +472,7 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) @@ -1147,7 +1147,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1201,7 +1201,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit From 8ab22804efefea9ecf3c68aa00f1fa69c70fcfad Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:36 -0700 Subject: x86/fpu/xstate: Define new macros for supervisor and user xstates XCNTXT_MASK is 'all supported xfeatures' before introducing supervisor xstates. Rename it to XFEATURE_MASK_USER_SUPPORTED to make clear that these are user xstates. Replace XFEATURE_MASK_SUPERVISOR with the following: - XFEATURE_MASK_SUPERVISOR_SUPPORTED: Currently nothing. ENQCMD and Control-flow Enforcement Technology (CET) will be introduced in separate series. - XFEATURE_MASK_SUPERVISOR_UNSUPPORTED: Currently only Processor Trace. - XFEATURE_MASK_SUPERVISOR_ALL: the combination of above. 
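[ Schematically, the new macros partition the xfeature bit space as sketched below. This is abridged and editorial: only a few user features are listed, and the bit positions are the architectural XFEATURE numbers, with Processor Trace assumed at bit 8. ]

#define BIT_ULL(n)	(1ULL << (n))

#define XFEATURE_MASK_FP	BIT_ULL(0)
#define XFEATURE_MASK_SSE	BIT_ULL(1)
#define XFEATURE_MASK_YMM	BIT_ULL(2)
#define XFEATURE_MASK_PT	BIT_ULL(8)	/* Processor Trace */

/* User xstates the kernel supports (abridged) */
#define XFEATURE_MASK_USER_SUPPORTED \
	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)

/* Empty until ENQCMD/CET support arrives in later series */
#define XFEATURE_MASK_SUPERVISOR_SUPPORTED	0

/* Known supervisor states the kernel does not use */
#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED	XFEATURE_MASK_PT

#define XFEATURE_MASK_SUPERVISOR_ALL \
	(XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
	 XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)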
Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-3-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/init.c | 3 ++- arch/x86/kernel/fpu/xstate.c | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6ce7e0a23268..61ddc3a5e5c2 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ u64 __init fpu__get_supported_xfeatures_mask(void) { - return XCNTXT_MASK; + return XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* Legacy code to initialize eager fpu mode. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8ed64397c78b..9997df717339 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -208,14 +208,13 @@ void fpu__init_cpu_xstate(void) if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. + * Unsupported supervisor xstates should not be found in + * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), + "x86/fpu: Found unsupported supervisor xstates.\n"); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -438,7 +437,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) * format. Checking a supervisor state's uncompacted offset is * an error. 
*/ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); return -1; } @@ -475,7 +474,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -768,7 +767,8 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_xstate_size, + xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -996,7 +996,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; /* * Copy xregs_state->header: @@ -1080,7 +1080,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; /* * Copy xregs_state->header: @@ -1173,7 +1173,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1229,7 +1229,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: -- cgit From 524bb73bc15c56f5587e33c817e103a259b019d2 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:37 -0700 Subject: x86/fpu/xstate: Separate user and supervisor xfeatures mask Before the introduction of XSAVES supervisor states, 'xfeatures_mask' is used at various places to determine XSAVE buffer components and XCR0 bits. It contains only user xstates. To support supervisor xstates, it is necessary to separate user and supervisor xstates: - First, change 'xfeatures_mask' to 'xfeatures_mask_all', which represents the full set of bits that should ever be set in a kernel XSAVE buffer. - Introduce xfeatures_mask_supervisor() and xfeatures_mask_user() to extract relevant xfeatures from xfeatures_mask_all. 
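[ The two accessors live in a header outside the arch/x86/kernel hunks shown here. Consistent with how the diffs below use them, they plausibly reduce to the following sketch, reusing the macro names introduced by the previous patch: ]

static inline u64 xfeatures_mask_supervisor(void)
{
	return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}

static inline u64 xfeatures_mask_user(void)
{
	return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
}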
Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-4-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/signal.c | 16 +++++++--- arch/x86/kernel/fpu/xstate.c | 73 ++++++++++++++++++++++++++------------------ 2 files changed, 54 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 585e3651b98f..3df0cfae535f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -252,13 +252,17 @@ sanitize_restored_xstate(union fpregs_state *state, */ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { + u64 init_bv; + if (use_xsave()) { if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); } else { - u64 init_bv = xfeatures_mask & ~xbv; + init_bv = xfeatures_mask_user() & ~xbv; + if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_xregs(buf, xbv); @@ -358,7 +362,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -389,7 +393,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } @@ -465,7 +471,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9997df717339..fa71af643025 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,9 +54,10 @@ static short xsave_cpuid_features[] __initdata = { }; /* - * Mask of xstate features supported by the CPU and the kernel: + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask __read_mostly; +u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -76,7 +77,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -150,7 +151,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. 
*/ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) return; /* @@ -177,7 +178,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * in a special way already: */ feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; /* * Update all the remaining memory layouts according to their @@ -205,19 +206,28 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; /* * Unsupported supervisor xstates should not be found in * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), - "x86/fpu: Found unsupported supervisor xstates.\n"); + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -225,9 +235,9 @@ void fpu__init_cpu_xstate(void) * functions here: one for user xstates and the other for * system xstates. For now, they are the same. */ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return xfeatures_mask_all & BIT_ULL(xfeature); } /* @@ -414,7 +424,7 @@ static void __init setup_init_fpu_buf(void) if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xfeatures_mask_all; /* * Init all the features state with header.xfeatures being 0x0 @@ -474,7 +484,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) + if (hdr->xfeatures & ~xfeatures_mask_user()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -609,7 +619,7 @@ static void do_extra_xstate_size_checks(void) /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -699,7 +709,7 @@ static int __init init_xstate_size(void) */ static void fpu__init_disable_system_xstate(void) { - xfeatures_mask = 0; + xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); } @@ -734,16 +744,21 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. 
+ */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + xfeatures_mask_all = eax + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* Place supervisor features in xfeatures_mask_all here */ + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); goto out_disable; } @@ -752,10 +767,10 @@ void __init fpu__init_system_xstate(void) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -767,8 +782,7 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, - xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -776,7 +790,7 @@ void __init fpu__init_system_xstate(void) print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, + xfeatures_mask_all, fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); return; @@ -795,7 +809,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -840,10 +854,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -996,7 +1009,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: @@ -1080,7 +1093,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: -- cgit From 71581eefd7a0a81b1af7d7c93641925a01d70a9a Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:38 -0700 Subject: x86/fpu/xstate: Introduce XSAVES supervisor states Enable XSAVES supervisor states by setting MSR_IA32_XSS bits according to CPUID enumeration results. 
Also revise comments at various places. Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-5-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/xstate.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index fa71af643025..a68213ed5be6 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -228,13 +228,14 @@ void fpu__init_cpu_xstate(void) * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * MSR_IA32_XSS sets supervisor states managed by XSAVES. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } -/* - * Note that in the future we will likely need a pair of - * functions here: one for user xstates and the other for - * system xstates. For now, they are the same. - */ static bool xfeature_enabled(enum xfeature xfeature) { return xfeatures_mask_all & BIT_ULL(xfeature); @@ -625,9 +626,6 @@ static void do_extra_xstate_size_checks(void) * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. - * - * Note that we do not currently set any bits on IA32_XSS so - * 'XCR0 | IA32_XSS == XCR0' for now. */ static unsigned int __init get_xsaves_size(void) { @@ -750,7 +748,12 @@ void __init fpu__init_system_xstate(void) cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xfeatures_mask_all = eax + ((u64)edx << 32); - /* Place supervisor features in xfeatures_mask_all here */ + /* + * Find supervisor xstates supported by the processor. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + xfeatures_mask_all |= ecx + ((u64)edx << 32); + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened @@ -810,6 +813,13 @@ void fpu__resume_cpu(void) */ if (boot_cpu_has(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); + + /* + * Restore IA32_XSS. The same CPUID bit enumerates support + * of XSAVES and MSR_IA32_XSS. + */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); } /* -- cgit From b860eb8dce5906b14e3a7f3c771e0b3d6ef61b94 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:39 -0700 Subject: x86/fpu/xstate: Define new functions for clearing fpregs and xstates Currently, fpu__clear() clears all fpregs and xstates. Once XSAVES supervisor states are introduced, supervisor settings (e.g. CET xstates) must remain active for signals; it is necessary to have separate functions: - Create fpu__clear_user_states(): clear only user settings for signals; - Create fpu__clear_all(): clear both user and supervisor settings in flush_thread(). Also modify copy_init_fpstate_to_fpregs() to take a mask from the above two functions. Remove the obvious side-comment in fpu__clear(), while at it. [ bp: Make the second argument of fpu__clear() bool after requesting it a bunch of times during review. - Add a comment about copy_init_fpstate_to_fpregs() locking needs.
] Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-6-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/core.c | 53 +++++++++++++++++++++++++++++--------------- arch/x86/kernel/fpu/signal.c | 4 ++-- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/signal.c | 2 +- 4 files changed, 39 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12c70840980e..06c818967bb6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -291,15 +291,13 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -307,9 +305,6 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); - - fpregs_mark_activate(); - fpregs_unlock(); } /* @@ -318,18 +313,40 @@ static inline void copy_init_fpstate_to_fpregs(void) * Called by sys_execve(), by the signal handler code and by various * error paths. */ -void fpu__clear(struct fpu *fpu) +static void fpu__clear(struct fpu *fpu, bool user_only) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != ¤t->thread.fpu); - fpu__drop(fpu); + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpu__drop(fpu); + fpu__initialize(fpu); + return; + } - /* - * Make sure fpstate is cleared and initialized. 
- */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + fpregs_lock(); + + if (user_only) { + if (!fpregs_state_valid(fpu, smp_processor_id()) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + } else { + copy_init_fpstate_to_fpregs(xfeatures_mask_all); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu__clear_user_states(struct fpu *fpu) +{ + fpu__clear(fpu, true); +} + +void fpu__clear_all(struct fpu *fpu) +{ + fpu__clear(fpu, false); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3df0cfae535f..cd6eafba12da 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -289,7 +289,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__clear(fpu); + fpu__clear_user_states(fpu); return 0; } @@ -416,7 +416,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) err_out: if (ret) - fpu__clear(fpu); + fpu__clear_user_states(fpu); return ret; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..de182b84723a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -191,7 +191,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu__clear_all(&tsk->thread.fpu); } void disable_TSC(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..0052bbe5dfd4 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -732,7 +732,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } -- cgit From 5d6b6a6f9b5ce7ac42273efd75d61ec63b463c18 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:40 -0700 Subject: x86/fpu/xstate: Update sanitize_restored_xstate() for supervisor xstates The function sanitize_restored_xstate() sanitizes user xstates of an XSAVE buffer by clearing bits not in the input 'xfeatures' from the buffer's header->xfeatures, effectively resetting those features back to the init state. When supervisor xstates are introduced, it is necessary to make sure only user xstates are sanitized. Ensure supervisor bits in header->xfeatures stay set and supervisor states are not modified. To make names clear, also: - Rename the function to sanitize_restored_user_xstate(). - Rename input parameter 'xfeatures' to 'user_xfeatures'. - In __fpu__restore_sig(), rename 'xfeatures' to 'user_xfeatures'. 
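[ Distilled to its core, the preserved-supervisor rule reads as follows. This is an editorial sketch of the two-branch logic in the diff below; XFEATURE_MASK_FPSSE stands for the x87+SSE bit pair. ]

#include <stdint.h>

#define XFEATURE_MASK_FPSSE	0x3ULL	/* x87 | SSE */

static void sanitize_xfeatures(uint64_t *hdr_xfeatures,
			       uint64_t user_xfeatures,
			       uint64_t supervisor_mask, int fx_only)
{
	if (fx_only)		/* legacy FXSAVE frame: x87/SSE only */
		*hdr_xfeatures = XFEATURE_MASK_FPSSE;
	else			/* reset only user bits; keep supervisor */
		*hdr_xfeatures &= user_xfeatures | supervisor_mask;
}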
Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-7-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/signal.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index cd6eafba12da..77e5c2e34ee6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -211,9 +211,9 @@ retry: } static inline void -sanitize_restored_xstate(union fpregs_state *state, - struct user_i387_ia32_struct *ia32_env, - u64 xfeatures, int fx_only) +sanitize_restored_user_xstate(union fpregs_state *state, + struct user_i387_ia32_struct *ia32_env, + u64 user_xfeatures, int fx_only) { struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; @@ -226,13 +226,22 @@ sanitize_restored_xstate(union fpregs_state *state, */ /* - * Init the state that is not present in the memory - * layout and not enabled by the OS. + * 'user_xfeatures' might have bits clear which are + * set in header->xfeatures. This represents features that + * were in init state prior to a signal delivery, and need + * to be reset back to the init state. Clear any user + * feature bits which are set in the kernel buffer to get + * them back to the init state. + * + * Supervisor state is unchanged by input from userspace. + * Ensure supervisor state bits stay set and supervisor + * state is not modified. */ if (fx_only) header->xfeatures = XFEATURE_MASK_FPSSE; else - header->xfeatures &= xfeatures; + header->xfeatures &= user_xfeatures | + xfeatures_mask_supervisor(); } if (use_fxsr()) { @@ -281,7 +290,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; - u64 xfeatures = 0; + u64 user_xfeatures = 0; int fx_only = 0; int ret = 0; @@ -314,7 +323,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; - xfeatures = fx_sw_user.xfeatures; + user_xfeatures = fx_sw_user.xfeatures; } } @@ -349,7 +358,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpregs_lock(); pagefault_disable(); - ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { fpregs_mark_activate(); @@ -362,7 +371,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask_user() & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -375,12 +384,13 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (ret) goto err_out; - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, user_xfeatures); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); @@ -389,7 +399,8 @@ static int 
__fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) goto err_out; } - sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures, + fx_only); fpregs_lock(); if (use_xsave()) { -- cgit From 6255c161a08564e4f3995db31f3d64a5fd24738b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 15 May 2020 20:21:21 +0200 Subject: x86/nmi: Remove edac.h include leftover ... which db47d5f85646 ("x86/nmi, EDAC: Get rid of DRAM error reporting thru PCI SERR NMI") forgot to remove. No functional changes. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200515182246.3553-1-bp@alien8.de --- arch/x86/kernel/nmi.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 6407ea21fa1b..bdcc5146de96 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -25,10 +25,6 @@ #include #include -#if defined(CONFIG_EDAC) -#include -#endif - #include #include #include -- cgit From eeedf1533687b8e81865fdbde79eddf7c4b76c9a Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:42 -0700 Subject: x86/fpu: Introduce copy_supervisor_to_kernel() The XSAVES instruction takes a mask and saves only the features specified in that mask. The kernel normally specifies that all features be saved. XSAVES also unconditionally uses the "compacted format" which means that all specified features are saved next to each other in memory. If a feature is removed from the mask, all the features after it will "move up" into earlier locations in the buffer. Introduce copy_supervisor_to_kernel(), which saves only supervisor states and then moves those states into the standard location where they are normally found. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-9-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/xstate.c | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a68213ed5be6..587e03f0094d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -62,6 +62,7 @@ u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -392,6 +393,33 @@ static void __init setup_xstate_comp_offsets(void) } } +/* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). 
+ */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + /* * Print out xstate component offsets and sizes */ @@ -790,6 +818,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", @@ -1262,6 +1291,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. + */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. + */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 -- cgit From 98265c17efa9f2279c59262cd27679aca12e0bb8 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:43 -0700 Subject: x86/fpu/xstate: Preserve supervisor states for the slow path in __fpu__restore_sig() The signal return code is responsible for taking an XSAVE buffer present in user memory and loading it into the hardware registers. This operation only affects user XSAVE state and never affects supervisor state. The fast path through this code simply points XRSTOR directly at the user buffer. However, since user memory is not guaranteed to be always mapped, this XRSTOR can fail. If it fails, the signal return code falls back to a slow path which can tolerate page faults. That slow path copies the xfeatures one by one out of the user buffer into the task's fpu state area. However, by being in a context where it can handle page faults, the code can also schedule. 
The lazy-fpu-load code would think it has an up-to-date fpstate and would fail to save the supervisor state when scheduling the task out. When scheduling back in, it would likely restore stale supervisor state. To fix that, preserve supervisor state before the slow path. Modify copy_user_to_fpregs_zeroing() so that if it fails, fpregs are not zeroed, and there is no need for fpregs_deactivate() and supervisor states are preserved. Move set_thread_flag(TIF_NEED_FPU_LOAD) to the slow path. Without doing this, the fast path also needs supervisor states to be saved first. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-10-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/signal.c | 53 +++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 77e5c2e34ee6..6184fe70a9e1 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -262,19 +262,23 @@ sanitize_restored_user_xstate(union fpregs_state *state, static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { u64 init_bv; + int r; if (use_xsave()) { if (fx_only) { init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; - copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_fxregs(buf); + r = copy_user_to_fxregs(buf); + if (!r) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + return r; } else { init_bv = xfeatures_mask_user() & ~xbv; - if (unlikely(init_bv)) + r = copy_user_to_xregs(buf, xbv); + if (!r && unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - return copy_user_to_xregs(buf, xbv); + return r; } } else if (use_fxsr()) { return copy_user_to_fxregs(buf); @@ -327,28 +331,10 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } - /* - * The current state of the FPU registers does not matter. By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered - * to be loaded again on return to userland (overriding last_cpu avoids - * the optimisation). - */ - set_thread_flag(TIF_NEED_FPU_LOAD); - __fpu_invalidate_fpregs_state(fpu); - if ((unsigned long)buf_fx % 64) fx_only = 1; - /* - * For 32-bit frames with fxstate, copy the fxstate so it can be - * reconstructed later. - */ - if (ia32_fxstate) { - ret = __copy_from_user(&env, buf, sizeof(env)); - if (ret) - goto err_out; - envp = &env; - } else { + + if (!ia32_fxstate) { /* * Attempt to restore the FPU registers directly from user * memory. For that to succeed, the user access cannot cause @@ -365,10 +351,27 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_unlock(); return 0; } - fpregs_deactivate(fpu); fpregs_unlock(); + } else { + /* + * For 32-bit frames with fxstate, copy the fxstate so it can + * be reconstructed later. + */ + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; } + /* + * The current state of the FPU registers does not matter. By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). 
+ */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; -- cgit From 55e00fb66fd5048f4a3ee357018fd26fc527abca Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:44 -0700 Subject: x86/fpu/xstate: Restore supervisor states for signal return The signal return fast path directly restores user states from the user buffer. Once that succeeds, restore supervisor states (but only when they are not yet restored). For the slow path, save supervisor states to preserve them across context switches, and restore after the user states are restored. The previous version has the overhead of an XSAVES in both the fast and the slow paths. It is addressed as follows: - In the fast path, only do an XRSTORS. - In the slow path, do a supervisor-state-only XSAVES, and relocate the buffer contents. Some thoughts on the implementation: - In the slow path, can any supervisor state become stale between save/restore? Answer: set_thread_flag(TIF_NEED_FPU_LOAD) protects the xstate buffer. - In the slow path, can any code reference a stale supervisor state register between save/restore? Answer: In the current lazy-restore scheme, any reference to xstate registers needs fpregs_lock()/fpregs_unlock() and __fpregs_load_activate(). - Are there other options? One other option is eagerly restoring all supervisor states. Currently, CET user-mode states and ENQCMD's PASID do not need to be eagerly restored. The upcoming CET kernel-mode states (24 bytes) need to be eagerly restored. To me, eagerly restoring all supervisor states adds more overhead than benefit at this point. Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-11-yu-cheng.yu@intel.com --- arch/x86/kernel/fpu/signal.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6184fe70a9e1..9393a445d73c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -347,6 +347,23 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only); pagefault_enable(); if (!ret) { + + /* + * Restore supervisor states: previous context switch + * etc has done XSAVES and saved the supervisor states + * in the kernel buffer from which they can be restored + * now. + * + * We cannot do a single XRSTORS here - which would + * be nice - because the rest of the FPU registers are + * being restored from a user buffer directly. The + * single XRSTORS happens below, when the user buffer + * has been copied to the kernel one. + */ + if (test_thread_flag(TIF_NEED_FPU_LOAD) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); fpregs_mark_activate(); fpregs_unlock(); return 0; @@ -364,14 +381,25 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) /* - * The current state of the FPU registers does not matter.
By setting - * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate - * is not modified on context switch and that the xstate is considered + * By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is + * not modified on context switch and that the xstate is considered * to be loaded again on return to userland (overriding last_cpu avoids * the optimisation). */ - set_thread_flag(TIF_NEED_FPU_LOAD); + fpregs_lock(); + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + + /* + * Supervisor states are not modified by user space input. Save + * current supervisor states first and invalidate the FPU regs. + */ + if (xfeatures_mask_supervisor()) + copy_supervisor_to_kernel(&fpu->state.xsave); + set_thread_flag(TIF_NEED_FPU_LOAD); + } __fpu_invalidate_fpregs_state(fpu); + fpregs_unlock(); if (use_xsave() && !fx_only) { u64 init_bv = xfeatures_mask_user() & ~user_xfeatures; @@ -393,7 +421,13 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); - ret = copy_kernel_to_xregs_err(&fpu->state.xsave, user_xfeatures); + + /* + * Restore previously saved supervisor xstates along with + * copied-in user xstates. + */ + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, + user_xfeatures | xfeatures_mask_supervisor()); } else if (use_fxsr()) { ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); -- cgit From b052df3da821adfd6be26a6eb16624fb50e90e56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Mar 2020 00:52:41 +0100 Subject: x86/entry: Get rid of ist_begin/end_non_atomic() This is completely overengineered and definitely not an interface which should be made available to anything else than this particular MCE case. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.462640294@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 6 ++++-- arch/x86/kernel/traps.c | 37 ------------------------------------- 2 files changed, 4 insertions(+), 39 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e8..98bf91cd7d5d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1352,13 +1352,15 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); local_irq_enable(); + preempt_enable(); if (kill_it || do_memory_failure(&m)) force_sig(SIGBUS); + preempt_disable(); local_irq_disable(); - ist_end_non_atomic(); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..6740e8351486 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -117,43 +117,6 @@ void ist_exit(struct pt_regs *regs) rcu_nmi_exit(); } -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. 
ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) -{ - preempt_disable(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; -- cgit From 5567d11c21a1d508a91a8cb64a819783a0835d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 10:22:06 +0100 Subject: x86/mce: Send #MC singal from task work Convert #MC over to using task_work_add(); it will run the same code slightly later, on the return to user path of the same exception. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134100.957390899@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 56 +++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 98bf91cd7d5d..2f0ef95795f3 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1086,23 +1087,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1204,6 +1188,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1222,7 +1229,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. 
*/ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1354,13 +1361,13 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) if ((m.cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - local_irq_enable(); - preempt_enable(); - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - preempt_disable(); - local_irq_disable(); + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, &current->mce_kill_me, true); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); @@ -1370,7 +1377,6 @@ out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) -- cgit From 0d00449c7a28a1514595630735df383dec606812 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 09:46:43 +0100 Subject: x86: Replace ist_enter() with nmi_enter() A few exceptions (like #DB and #BP) can happen at any location in the code; this then means that tracers should treat events from these exceptions as NMI-like. The interrupted context could be holding locks with interrupts disabled, for instance. Similarly, #MC is an actual NMI-like exception. All of them use ist_enter(), which only concerns itself with RCU but does not do any of the other setup that NMIs need. This means things like:

	printk()
	  raw_spin_lock_irq(&logbuf_lock);
	  <#DB/#BP/#MC>
	     printk()
	       raw_spin_lock_irq(&logbuf_lock);

are entirely possible (well, not really since printk tries hard to play nice, but the concept stands). So replace ist_enter() with nmi_enter(). Also observe that any nmi_enter() caller must be both notrace and NOKPROBE, or in the noinstr text section.
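The resulting shape of such an NMI-like handler is then simply (an illustrative sketch mirroring the conversions below, not additional code):

	static void winchip_machine_check(struct pt_regs *regs, long error_code)
	{
		nmi_enter();

		/* NMI-like context: no scheduling, tracing/kprobes restricted */
		...

		nmi_exit();
	}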
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.525508608@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 5 +-- arch/x86/kernel/cpu/mce/p5.c | 5 +-- arch/x86/kernel/cpu/mce/winchip.c | 5 +-- arch/x86/kernel/traps.c | 71 +++++++++------------------------------ 4 files changed, 24 insertions(+), 62 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2f0ef95795f3..e9265e2f28c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1266,7 +1267,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) if (__mc_check_crashing_cpu(cpu)) return; - ist_enter(regs); + nmi_enter(); this_cpu_inc(mce_exception_count); @@ -1374,7 +1375,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) } out_ist: - ist_exit(regs); + nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..5ee94aa1b766 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - ist_enter(regs); + nmi_enter(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +40,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..b3938c195365 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -18,12 +19,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - ist_enter(regs); + nmi_enter(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6740e8351486..f7cfb9d0ad02 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include #include #include +#include +#include + #include #include #include -#include #include #include #include @@ -82,41 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. 
- */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; @@ -326,7 +293,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. + * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && @@ -361,7 +328,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign } #endif - ist_enter(regs); + nmi_enter(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -555,19 +522,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from a kprobe in - * non-CONTEXT_KERNEL kernel mode or even during context tracking - * state changes. Make sure that we wake up RCU even if we're coming - * from kernel code. - * - * This means that we can't schedule even if we came from a - * preemptible kernel context. That's okay. + * Unlike any other non-IST entry, we can be called from pretty much + * any location in the kernel through kprobes -- text_poke() will most + * likely be handled by poke_int3_handler() above. This means this + * handler is effectively NMI-like. */ - if (!user_mode(regs)) { - rcu_nmi_enter(); - preempt_disable(); - } - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (!user_mode(regs)) + nmi_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -589,10 +550,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) { - preempt_enable_no_resched(); - rcu_nmi_exit(); - } + if (!user_mode(regs)) + nmi_exit(); } NOKPROBE_SYMBOL(do_int3); @@ -696,7 +655,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - ist_enter(regs); + nmi_enter(); get_debugreg(dr6, 6); /* @@ -789,7 +748,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs); + nmi_exit(); } NOKPROBE_SYMBOL(do_debug); -- cgit From ef68017eb5704eb2b0577c3aa6619e13caf2b59f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 28 Feb 2020 10:42:48 -0800 Subject: x86/kvm: Handle async page faults directly through do_page_fault() KVM overloads #PF to indicate two types of not-actually-page-fault events. Right now, the KVM guest code intercepts them by modifying the IDT and hooking the #PF vector. This makes the already fragile fault code even harder to understand, and it also pollutes call traces with async_page_fault and do_async_page_fault for normal page faults. Clean it up by moving the logic into do_page_fault() using a static branch. This gets rid of the platform trap_init override mechanism completely. 
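The do_page_fault() side of the hook is outside this log (which is limited to 'arch/x86/kernel'); conceptually it reduces to something like the sketch below, where the wrapper name and exact placement are assumptions based on the description above, while kvm_async_pf_enabled and __kvm_handle_async_pf() are taken from the hunk that follows:

	/* Sketch: called early in the #PF path, guarded by the static branch */
	static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token)
	{
		if (static_branch_unlikely(&kvm_async_pf_enabled))
			return __kvm_handle_async_pf(regs, token);

		return false;
	}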
[ tglx: Fixed up 32bit, removed error code from the async functions and massaged coding style ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.169270470@linutronix.de --- arch/x86/kernel/kvm.c | 39 +++++++++++++++++++++------------------ arch/x86/kernel/traps.c | 2 -- arch/x86/kernel/x86_init.c | 1 - 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6efe0410fb72..5ad3fcca2309 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -35,6 +35,8 @@ #include #include +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + static int kvmapf = 1; static int __init parse_no_kvmapf(char *arg) @@ -242,25 +244,27 @@ u32 kvm_read_and_reset_pf_reason(void) EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { + /* + * If we get a page fault right here, the pf_reason seems likely + * to be clobbered. Bummer. + */ switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code, address); - break; + return false; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ - kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - break; + kvm_async_pf_task_wait(token, !user_mode(regs)); + return true; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - kvm_async_pf_task_wake((u32)address); + kvm_async_pf_task_wake(token); rcu_irq_exit(); - break; + return true; } } -NOKPROBE_SYMBOL(do_async_page_fault); +NOKPROBE_SYMBOL(__kvm_handle_async_pf); static void __init paravirt_ops_setup(void) { @@ -306,7 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; + + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); #ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -592,12 +600,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu) } #endif -static void __init kvm_apf_trap_init(void) -{ - update_intr_gate(X86_TRAP_PF, async_page_fault); -} - - static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -632,8 +634,6 @@ static void __init kvm_guest_init(void) register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; @@ -649,6 +649,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + static_branch_enable(&kvm_async_pf_enabled); + #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..821fac47eef6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -983,7 +983,5 @@ void __init 
trap_init(void) idt_setup_ist_traps(); - x86_init.irqs.trap_init(); - idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 85f1a90c55cd..123f1c1f1788 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, - .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, -- cgit From 6bca69ada4bc20fa27eb44a5e09da3363d1752af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Mar 2020 00:42:06 +0100 Subject: x86/kvm: Sanitize kvm_async_pf_task_wait() While working on the entry consolidation I stumbled over the KVM async page fault handler and kvm_async_pf_task_wait() in particular. It took me a while to realize that the randomly sprinkled around rcu_irq_enter()/exit() invocations are just cargo cult programming. Several patches "fixed" RCU splats by curing the symptoms without noticing that the code is flawed from a design perspective. The main problem is that this async injection is not based on a proper handshake mechanism and only respects the minimal requirement, i.e. the guest is not in a state where it has interrupts disabled. Aside from that, the actual code is a convoluted one-fits-it-all Swiss army knife. It is invoked from different places with different RCU constraints:

  1) Host side:

     vcpu_enter_guest()
       kvm_x86_ops->handle_exit()
         kvm_handle_page_fault()
           kvm_async_pf_task_wait()

     The invocation happens from fully preemptible context.

  2) Guest side: The async page fault interrupted:

     a) user space

     b) preemptible kernel code which is not in a RCU read side critical section

     c) non-preemptible kernel code or a RCU read side critical section or kernel code with CONFIG_PREEMPTION=n

     which makes it unnecessary to differentiate between #2b and #2c.

RCU is watching for:

  #1  The vCPU exited and current is definitely not the idle task

  #2a The #PF entry code on the guest went through enter_from_user_mode() which reactivates RCU

  #2b There is no preemptible, interrupts enabled code in the kernel which can run with RCU looking away. (The idle task is always non-preemptible.)

I.e. all schedulable states (#1, #2a, #2b) do not need any of this RCU voodoo at all. In #2c RCU is eventually not watching, but as that state cannot schedule anyway, there is no point in worrying about it, so it has to invoke rcu_irq_enter() before running that code. This can be optimized, but this will be done as an extra step in the course of the entry code consolidation work. So the proper solution for this is to: - Split kvm_async_pf_task_wait() into schedule and halt based waiting interfaces which share the enqueueing code. - Add comments (condensed form of this changelog) to spare others the time waste and pain of reverse engineering all of this with the help of incomprehensible changelogs and code history. - Invoke kvm_async_pf_task_wait_schedule() from kvm_handle_page_fault(), user mode and schedulable kernel side async page faults (#1, #2a, #2b) - Invoke kvm_async_pf_task_wait_halt() for the non-schedulable kernel case (#2c). For this case also remove the rcu_irq_exit()/enter() pair around the halt as it is just a pointless exercise: - vCPUs can VMEXIT at any random point and can be scheduled out for an arbitrary amount of time by the host and this is not any different except that it voluntarily triggers the exit via halt. - The interrupted context could have RCU watching already.
So the rcu_irq_exit() before the halt is not gaining anything aside of confusing the reader. Claiming that this might prevent RCU stalls is just an illusion. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.262701431@linutronix.de --- arch/x86/kernel/kvm.c | 201 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 141 insertions(+), 60 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5ad3fcca2309..c6a82f9f537f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,7 +75,7 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool halted; + bool use_halt; }; static struct kvm_task_sleep_head { @@ -98,75 +98,145 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other than user space)? - */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, bool use_halt, + struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); + kfree(e); + return false; + } - rcu_irq_exit(); + n->token = token; + n->cpu = smp_processor_id(); + n->use_halt = use_halt; + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, false, &n)) return; + + for (;;) { + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + local_irq_enable(); + schedule(); + local_irq_disable(); } + finish_swait(&n.wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); - raw_spin_unlock(&b->lock); +/* + * Invoked from the async page fault handler. + */ +static void kvm_async_pf_task_wait_halt(u32 token) +{ + struct kvm_task_sleep_node n; + + if (!kvm_async_pf_queue_task(token, true, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; + /* + * No point in doing anything about RCU here. Any RCU read + * side critical section or RCU watching section can be + * interrupted by VMEXITs and the host is free to keep the + * vCPU scheduled out as long as it sees fit. 
This is not + * any different just because of the halt induced voluntary + * VMEXIT. + * + * Also the async page fault could have interrupted any RCU + * watching context, so invoking rcu_irq_exit()/enter() + * around this is not gaining anything. + */ + native_safe_halt(); + local_irq_disable(); + } +} - rcu_irq_exit(); +/* Invoked from the async page fault handler */ +static void kvm_async_pf_task_wait(u32 token, bool usermode) +{ + bool can_schedule; - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. - */ - native_safe_halt(); - local_irq_disable(); - } + /* + * No need to check whether interrupts were disabled because the + * host will (hopefully) only inject an async page fault into + * interrupt enabled regions. + * + * If CONFIG_PREEMPTION is enabled then check whether the code + * which triggered the page fault is preemptible. This covers user + * mode as well because preempt_count() is obviously 0 there. + * + * The check for rcu_preempt_depth() is also required because + * voluntary scheduling inside a rcu read locked section is not + * allowed. + * + * The idle task is already covered by this because idle always + * has a preempt count > 0. + * + * If CONFIG_PREEMPTION is disabled only allow scheduling when + * coming from user mode as there is no indication whether the + * context which triggered the page fault could schedule or not. + */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + can_schedule = preempt_count() + rcu_preempt_depth() == 0; + else + can_schedule = usermode; + /* + * If the kernel context is allowed to schedule then RCU is + * watching because no preemptible code in the kernel is inside RCU + * idle state. So it can be treated like user mode. User mode is + * safe because the #PF entry invoked enter_from_user_mode(). + * + * For the non schedulable case invoke rcu_irq_enter() for + * now. This will be moved out to the pagefault entry code later + * and only invoked when really needed. + */ + if (can_schedule) { + kvm_async_pf_task_wait_schedule(token); + } else { rcu_irq_enter(); + kvm_async_pf_task_wait_halt(token); + rcu_irq_exit(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) + if (n->use_halt) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); @@ -177,12 +247,13 @@ static void apf_task_wake_all(void) int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -223,8 +294,9 @@ again: n->cpu = smp_processor_id(); init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); - } else + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); return; } @@ -246,23 +318,33 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - /* - * If we get a page fault right here, the pf_reason seems likely - * to be clobbered. Bummer. 
- */ - switch (kvm_read_and_reset_pf_reason()) { + u32 reason = kvm_read_and_reset_pf_reason(); + + switch (reason) { + case KVM_PV_REASON_PAGE_NOT_PRESENT: + case KVM_PV_REASON_PAGE_READY: + break; default: return false; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { /* page is swapped out by the host. */ - kvm_async_pf_task_wait(token, !user_mode(regs)); - return true; - case KVM_PV_REASON_PAGE_READY: + kvm_async_pf_task_wait(token, user_mode(regs)); + } else { rcu_irq_enter(); kvm_async_pf_task_wake(token); rcu_irq_exit(); - return true; } + return true; } NOKPROBE_SYMBOL(__kvm_handle_async_pf); @@ -326,12 +408,12 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -352,8 +434,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); } static void kvm_pv_guest_cpu_reboot(void *unused) -- cgit From 3a7c8fafd1b42adea229fd204132f6a2fb3cd2d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2020 09:57:56 +0200 Subject: x86/kvm: Restrict ASYNC_PF to user space The async page fault injection into kernel space creates more problems than it solves. The host has absolutely no knowledge about the state of the guest if the fault happens in CPL0. The only restriction for the host is the interrupt disabled state. If interrupts are enabled in the guest then the exception can hit arbitrary code. The HALT-based wait in non-preemptible code is a hacky replacement for a proper hypercall. For the ongoing work to restrict instrumentation and make the RCU idle interaction well defined, the required extra work for supporting async page faults in CPL0 is just not justified and creates complexity for a dubious benefit. The CPL3 injection is well defined and does not cause any issues as it is more or less the same as a regular page fault from CPL3.
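Condensed, the guest-side policy after this change reduces to (a sketch assembled from the hunk below, not the verbatim code):

	if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
		if (unlikely(!user_mode(regs)))
			panic("Host injected async #PF in kernel mode\n");
		/* Page is swapped out by the host. */
		kvm_async_pf_task_wait_schedule(token);
	}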
Suggested-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.369802541@linutronix.de --- arch/x86/kernel/kvm.c | 100 ++++---------------------------------------------- 1 file changed, 7 insertions(+), 93 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c6a82f9f537f..b3d9b0d7a37d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,7 +75,6 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool use_halt; }; static struct kvm_task_sleep_head { @@ -98,8 +97,7 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -static bool kvm_async_pf_queue_task(u32 token, bool use_halt, - struct kvm_task_sleep_node *n) +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -117,7 +115,6 @@ static bool kvm_async_pf_queue_task(u32 token, bool use_halt, n->token = token; n->cpu = smp_processor_id(); - n->use_halt = use_halt; init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); @@ -138,7 +135,7 @@ void kvm_async_pf_task_wait_schedule(u32 token) lockdep_assert_irqs_disabled(); - if (!kvm_async_pf_queue_task(token, false, &n)) + if (!kvm_async_pf_queue_task(token, &n)) return; for (;;) { @@ -154,91 +151,10 @@ void kvm_async_pf_task_wait_schedule(u32 token) } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); -/* - * Invoked from the async page fault handler. - */ -static void kvm_async_pf_task_wait_halt(u32 token) -{ - struct kvm_task_sleep_node n; - - if (!kvm_async_pf_queue_task(token, true, &n)) - return; - - for (;;) { - if (hlist_unhashed(&n.link)) - break; - /* - * No point in doing anything about RCU here. Any RCU read - * side critical section or RCU watching section can be - * interrupted by VMEXITs and the host is free to keep the - * vCPU scheduled out as long as it sees fit. This is not - * any different just because of the halt induced voluntary - * VMEXIT. - * - * Also the async page fault could have interrupted any RCU - * watching context, so invoking rcu_irq_exit()/enter() - * around this is not gaining anything. - */ - native_safe_halt(); - local_irq_disable(); - } -} - -/* Invoked from the async page fault handler */ -static void kvm_async_pf_task_wait(u32 token, bool usermode) -{ - bool can_schedule; - - /* - * No need to check whether interrupts were disabled because the - * host will (hopefully) only inject an async page fault into - * interrupt enabled regions. - * - * If CONFIG_PREEMPTION is enabled then check whether the code - * which triggered the page fault is preemptible. This covers user - * mode as well because preempt_count() is obviously 0 there. - * - * The check for rcu_preempt_depth() is also required because - * voluntary scheduling inside a rcu read locked section is not - * allowed. - * - * The idle task is already covered by this because idle always - * has a preempt count > 0. - * - * If CONFIG_PREEMPTION is disabled only allow scheduling when - * coming from user mode as there is no indication whether the - * context which triggered the page fault could schedule or not. 
- */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - can_schedule = preempt_count() + rcu_preempt_depth() == 0; - else - can_schedule = usermode; - - /* - * If the kernel context is allowed to schedule then RCU is - * watching because no preemptible code in the kernel is inside RCU - * idle state. So it can be treated like user mode. User mode is - * safe because the #PF entry invoked enter_from_user_mode(). - * - * For the non schedulable case invoke rcu_irq_enter() for - * now. This will be moved out to the pagefault entry code later - * and only invoked when really needed. - */ - if (can_schedule) { - kvm_async_pf_task_wait_schedule(token); - } else { - rcu_irq_enter(); - kvm_async_pf_task_wait_halt(token); - rcu_irq_exit(); - } -} - static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->use_halt) - smp_send_reschedule(n->cpu); - else if (swq_has_sleeper(&n->wq)) + if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } @@ -337,8 +253,10 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) panic("Host injected async #PF in interrupt disabled region\n"); if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { - /* page is swapped out by the host. */ - kvm_async_pf_task_wait(token, user_mode(regs)); + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. */ + kvm_async_pf_task_wait_schedule(token); } else { rcu_irq_enter(); kvm_async_pf_task_wake(token); @@ -397,10 +315,6 @@ static void kvm_guest_cpu_init(void) WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); - -#ifdef CONFIG_PREEMPTION - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif pa |= KVM_ASYNC_PF_ENABLED; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) -- cgit From 0e5e3d4461a22d739fb2284a6e313fb6cecf2871 Mon Sep 17 00:00:00 2001 From: Benjamin Thiel Date: Sat, 16 May 2020 14:38:16 +0200 Subject: x86/audit: Fix a -Wmissing-prototypes warning for ia32_classify_syscall() Lift the prototype of ia32_classify_syscall() into its own header. Signed-off-by: Benjamin Thiel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200516123816.2680-1-b.thiel@posteo.de --- arch/x86/kernel/audit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index e1efe44ebefc..83d9cad4e68b 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -3,6 +3,7 @@ #include #include #include +#include static unsigned dir_class[] = { #include @@ -41,7 +42,6 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_I386) return ia32_classify_syscall(syscall); #endif -- cgit From bd35c77e32e4359580207891c0f7a438ad4b42df Mon Sep 17 00:00:00 2001 From: Krzysztof Piecuch Date: Thu, 23 Jan 2020 16:09:26 +0000 Subject: x86/tsc: Add tsc_early_khz command line parameter Changing base clock frequency directly impacts TSC Hz but not CPUID.16h value. An overclocked CPU supporting CPUID.16h and with partial CPUID.15h support will set TSC KHZ according to "best guess" given by CPUID.16h relying on tsc_refine_calibration_work to give better numbers later. tsc_refine_calibration_work will refuse to do its work when the outcome is off the early TSC KHZ value by more than 1% which is certain to happen on an overclocked system. 
Fix this by adding a tsc_early_khz command line parameter that makes the kernel skip early TSC calibration and use the given value instead. This allows the user to provide the expected TSC frequency that is closer to reality than the one reported by the hardware, enabling tsc_refine_calibration_work to do meaningful error checking. [ tglx: Made the variable __initdata as it's only used on init and removed the error checking in the argument parser because kstrto*() only stores to the variable if the string is valid ] Signed-off-by: Krzysztof Piecuch Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/O2CpIOrqLZHgNRkfjRpz_LGqnc1ix_seNIiOCvHY4RHoulOVRo6kMXKuLOfBVTi0SMMevg6Go1uZ_cL9fLYtYdTRNH78ChaFaZyG3VAyYz8=@protonmail.com --- arch/x86/kernel/tsc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fdd4c1078632..49d925043171 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz); * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; +static unsigned int __initdata tsc_early_khz; static DEFINE_STATIC_KEY_FALSE(__use_tsc); @@ -59,6 +60,12 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); +static int __init tsc_early_khz_setup(char *buf) +{ + return kstrtouint(buf, 0, &tsc_early_khz); +} +early_param("tsc_early_khz", tsc_early_khz_setup); + __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); + if (tsc_early_khz) + tsc_khz = tsc_early_khz; + else + tsc_khz = x86_platform.calibrate_tsc(); } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); -- cgit From a4e91825d7e1252f7cba005f1451e5464b23c15d Mon Sep 17 00:00:00 2001 From: Alexander Monakov Date: Sun, 10 May 2020 20:48:40 +0000 Subject: x86/amd_nb: Add AMD family 17h model 60h PCI IDs Add PCI IDs for AMD Renoir (4000-series Ryzen CPUs). This is necessary to enable support for temperature sensors via the k10temp module. 
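As a quick sanity check (illustrative, not part of the patch): with these IDs in place, the new root and DF devices should be visible on a Renoir system's PCI bus, e.g. via lspci -nn | grep -Ei '1022:(1630|144c)', where 1022 is the AMD vendor ID and 0x1630/0x144c are the IDs added below.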
Signed-off-by: Alexander Monakov Signed-off-by: Borislav Petkov Acked-by: Yazen Ghannam Acked-by: Guenter Roeck Link: https://lkml.kernel.org/r/20200510204842.2603-2-amonakov@ispras.ru --- arch/x86/kernel/amd_nb.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index b6b3297851f3..18f6b7c4bd79 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -18,9 +18,11 @@ #define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 +#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 #define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 +#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c #define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 #define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 @@ -33,6 +35,7 @@ static const struct pci_device_id amd_root_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, {} }; @@ -50,6 +53,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, @@ -65,6 +69,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, -- cgit From 33649bf4494c1feaf1956a84895fcc0621aafd90 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Wed, 13 May 2020 17:11:23 -0500 Subject: x86/apic/uv: Remove code for unused distributed GRU mode Distributed GRU mode appeared in only one generation of UV hardware, and no version of the BIOS has shipped with this feature enabled, and we have no plans to ever change that. The gru.s3.mode check has always been and will continue to be false. So remove this dead code. 
Signed-off-by: Steve Wahl Signed-off-by: Borislav Petkov Acked-by: Dimitri Sivanich Link: https://lkml.kernel.org/r/20200513221123.GJ3240@raspberrypi --- arch/x86/kernel/apic/x2apic_uv_x.c | 59 +------------------------------------- 1 file changed, 1 insertion(+), 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 10339ad02033..69e70ed0f5e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -30,8 +30,6 @@ static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; -static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; -static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; /* Unpack OEM/TABLE ID's to be NULL terminated strings */ @@ -83,20 +81,7 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr) static inline bool is_GRU_range(u64 start, u64 end) { - if (gru_dist_base) { - u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */ - u64 sl = start & gru_dist_lmask; /* Base offset bits */ - u64 eu = end & gru_dist_umask; - u64 el = end & gru_dist_lmask; - - /* Must reside completely within a single GRU range: */ - return (sl == gru_dist_base && el == gru_dist_base && - su >= gru_first_node_paddr && - su <= gru_last_node_paddr && - eu == su); - } else { - return start >= gru_start_paddr && end <= gru_end_paddr; - } + return start >= gru_start_paddr && end <= gru_end_paddr; } static bool uv_is_untracked_pat_range(u64 start, u64 end) @@ -797,42 +782,6 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift init_extra_mapping_wb(paddr, bytes); } -static __init void map_gru_distributed(unsigned long c) -{ - union uvh_rh_gam_gru_overlay_config_mmr_u gru; - u64 paddr; - unsigned long bytes; - int nid; - - gru.v = c; - - /* Only base bits 42:28 relevant in dist mode */ - gru_dist_base = gru.v & 0x000007fff0000000UL; - if (!gru_dist_base) { - pr_info("UV: Map GRU_DIST base address NULL\n"); - return; - } - - bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; - gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); - gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); - gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ - - for_each_online_node(nid) { - paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | - gru_dist_base; - init_extra_mapping_wb(paddr, bytes); - gru_first_node_paddr = min(paddr, gru_first_node_paddr); - gru_last_node_paddr = max(paddr, gru_last_node_paddr); - } - - /* Save upper (63:M) bits of address only for is_GRU_range */ - gru_first_node_paddr &= gru_dist_umask; - gru_last_node_paddr &= gru_dist_umask; - - pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); -} - static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; @@ -846,12 +795,6 @@ static __init void map_gru_high(int max_pnode) return; } - /* Only UV3 has distributed GRU mode */ - if (is_uv3_hub() && gru.s3.mode) { - map_gru_distributed(gru.v); - return; - } - base = (gru.v & mask) >> shift; map_high("GRU", base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)base << shift); -- cgit From 140fd4ac78d385e6c8e6a5757585f6c707085f87 Mon Sep 17 00:00:00 2001 From: Hill Ma Date: Sat, 25 Apr 2020 13:06:41 -0700 Subject: x86/reboot/quirks: Add MacBook6,1 reboot quirk 
On MacBook6,1 reboot would hang unless parameter reboot=pci is added. Make it automatic. Signed-off-by: Hill Ma Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200425200641.GA1554@cslab.localdomain --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3ca43be4f9cf..8b8cebfd3298 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -197,6 +197,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), }, }, + { /* Handle problems with rebooting on Apple MacBook6,1 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook6,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook6,1"), + }, + }, { /* Handle problems with rebooting on Apple MacBookPro5 */ .callback = set_pci_reboot, .ident = "Apple MacBookPro5", -- cgit From de308d1815c9e8fe602a958c5c76142ff6501d75 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 May 2020 12:38:39 +0200 Subject: x86/apic: Make TSC deadline timer detection message visible The commit c84cb3735fd5 ("x86/apic: Move TSC deadline timer debug printk") removed the message which said that the deadline timer was enabled. It added a pr_debug() message which is issued when deadline timer validation succeeds. Well, issued only when CONFIG_DYNAMIC_DEBUG is enabled - otherwise pr_debug() calls get optimized away if DEBUG is not defined in the compilation unit. Therefore, make the above message pr_info() so that it is visible in dmesg. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200525104218.27018-1-bp@alien8.de --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e53dda210cd7..21d2f1de1057 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2093,7 +2093,7 @@ void __init init_apic_mappings(void) unsigned int new_apicid; if (apic_validate_deadline_timer()) - pr_debug("TSC deadline timer available\n"); + pr_info("TSC deadline timer available\n"); if (x2apic_mode) { boot_cpu_physical_apicid = read_apic_id(); -- cgit From fd52a75ca3545c965ff58a78b6ff0b0dc7d8d228 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 8 May 2020 22:08:08 +0800 Subject: x86/io_apic: Remove unused function mp_init_irq_at_boot() There are no callers in-tree anymore since ef9e56d894ea ("x86/ioapic: Remove obsolete post hotplug update") so remove it. Signed-off-by: YueHaibing Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200508140808.49428-1-yuehaibing@huawei.com --- arch/x86/kernel/apic/io_apic.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 913c88617848..ce61e3e7d399 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -154,19 +154,6 @@ static inline bool mp_is_legacy_irq(int irq) return irq >= 0 && irq < nr_legacy_irqs(); } -/* - * Initialize all legacy IRQs and all pins on the first IOAPIC - * if we have legacy interrupt controller. Kernel boot option "pirq=" - * may rely on non-legacy pins on the first IOAPIC. 
- */ -static inline int mp_init_irq_at_boot(int ioapic, int irq) -{ - if (!nr_legacy_irqs()) - return 0; - - return ioapic == 0 || mp_is_legacy_irq(irq); -} - static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic) { return ioapics[ioapic].irqdomain; -- cgit From 003d80535180f74f262c40462b9fccd7f004901a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 May 2020 12:09:43 +0200 Subject: x86/apb_timer: Drop unused TSC calibration Drop the APB-timer TSC calibration, which hasn't been used since the removal of Moorestown support by commit 1a8359e411eb ("x86/mid: Remove Intel Moorestown"). Signed-off-by: Johan Hovold Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200513100944.9171-1-johan@kernel.org --- arch/x86/kernel/apb_timer.c | 53 --------------------------------------------- 1 file changed, 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index fe698f96617c..263eeaddb0aa 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -345,56 +345,3 @@ out_noapbt: apb_timer_block_enabled = 0; panic("failed to enable APB timer\n"); } - -/* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate(void) -{ - int i, scale; - u64 old, new; - u64 t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - dw_apb_clocksource_start(clocksource_apbt); - - /* check if the timer can count down, otherwise return */ - old = dw_apb_clocksource_read(clocksource_apbt); - i = 10000; - while (--i) { - if (old != dw_apb_clocksource_read(clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq / 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - dw_apb_clocksource_start(clocksource_apbt); - - old = dw_apb_clocksource_read(clocksource_apbt); - old += loop; - - t1 = rdtsc(); - - do { - new = dw_apb_clocksource_read(clocksource_apbt); - } while (new < old); - - t2 = rdtsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * (apbt_freq / 1000)) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; -failed: - return 0; -} -- cgit From 429ac8b75a0b1c3478ffd584de8a63075cbe25e7 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 30 Apr 2020 16:46:35 -0700 Subject: x86/split_lock: Add Icelake microserver and Tigerlake CPU models Icelake microserver CPU supports split lock detection while it doesn't have the split lock enumeration bit in IA32_CORE_CAPABILITIES. Tigerlake CPUs do enumerate the MSR. [ bp: Merge the two model-adding patches into one. 
] Signed-off-by: Fenghua Yu Signed-off-by: Borislav Petkov Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1588290395-2677-1-git-send-email-fenghua.yu@intel.com --- arch/x86/kernel/cpu/intel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a19a680542ce..6abbcc774b82 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1135,9 +1135,12 @@ void switch_to_sld(unsigned long tifn) static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), {} }; -- cgit From 68fd66f100d196d35ab3008d4c69af3a0d7e7200 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:17 +0200 Subject: KVM: x86: extend struct kvm_vcpu_pv_apf_data with token info Currently, APF mechanism relies on the #PF abuse where the token is being passed through CR2. If we switch to using interrupts to deliver page-ready notifications we need a different way to pass the data. Extent the existing 'struct kvm_vcpu_pv_apf_data' with token information for page-ready notifications. While on it, rename 'reason' to 'flags'. This doesn't change the semantics as we only have reasons '1' and '2' and these can be treated as bit flags but KVM_PV_REASON_PAGE_READY is going away with interrupt based delivery making 'reason' name misleading. The newly introduced apf_put_user_ready() temporary puts both flags and token information, this will be changed to put token only when we switch to interrupt based notifications. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b3d9b0d7a37d..d6f22a3a1f7d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -218,23 +218,23 @@ again: } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); +NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - u32 reason = kvm_read_and_reset_pf_reason(); + u32 reason = kvm_read_and_reset_apf_flags(); switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: -- cgit From 0348801151b5aefbcf9d6e9b9e30aceb3a2a7b13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:19 -0700 Subject: x86: fix vmap arguments in map_irq_stack vmap does not take a gfp_t, the flags argument is for VM_* flags. 
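To make the distinction concrete, a minimal sketch of a correct caller: GFP_KERNEL is an allocation mode for allocators such as vmalloc(), whereas vmap() only maps pages that already exist, so its third argument takes VM_* flags:

    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    /* Map an existing page array contiguously into vmalloc space. */
    static void *map_page_array(struct page **pages, unsigned int nr_pages)
    {
            /* VM_MAP, not GFP_KERNEL: the pages are already allocated. */
            return vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
    }
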
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra (Intel) Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-3-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/kernel/irq_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..6b32ab009c19 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; -- cgit From 7f0a002b5a21302d9f4b29ba83c96cd433ff3769 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:40 -0700 Subject: x86/mm: remove vmalloc faulting Remove fault handling on vmalloc areas, as the vmalloc code now takes care of synchronizing changes to all page-tables in the system. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-8-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup_percpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? 
-- cgit From c120f3b81ede0e3d2cf34d457d7c628306760ff1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Feb 2020 17:32:57 -0500 Subject: x86: switch cp_stat64() to unsafe_put_user() Signed-off-by: Al Viro --- arch/x86/kernel/sys_ia32.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index ab03fede1422..f8d65c99feb8 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -135,26 +135,30 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) typeof(ubuf->st_gid) gid = 0; SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid)); SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid)); - if (!access_ok(ubuf, sizeof(struct stat64)) || - __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || - __put_user(stat->ino, &ubuf->__st_ino) || - __put_user(stat->ino, &ubuf->st_ino) || - __put_user(stat->mode, &ubuf->st_mode) || - __put_user(stat->nlink, &ubuf->st_nlink) || - __put_user(uid, &ubuf->st_uid) || - __put_user(gid, &ubuf->st_gid) || - __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || - __put_user(stat->size, &ubuf->st_size) || - __put_user(stat->atime.tv_sec, &ubuf->st_atime) || - __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || - __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || - __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user(stat->blksize, &ubuf->st_blksize) || - __put_user(stat->blocks, &ubuf->st_blocks)) + if (!user_write_access_begin(ubuf, sizeof(struct stat64))) return -EFAULT; + unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault); + unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault); + unsafe_put_user(stat->ino, &ubuf->st_ino, Efault); + unsafe_put_user(stat->mode, &ubuf->st_mode, Efault); + unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault); + unsafe_put_user(uid, &ubuf->st_uid, Efault); + unsafe_put_user(gid, &ubuf->st_gid, Efault); + unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault); + unsafe_put_user(stat->size, &ubuf->st_size, Efault); + unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault); + unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault); + unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault); + unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault); + unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault); + unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault); + unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault); + unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault); + user_access_end(); return 0; +Efault: + user_write_access_end(); + return -EFAULT; } COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename, -- cgit From 0e96edd9a9c2e75ece7f581d4f75d26b38cd53ba Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 May 2020 20:11:21 -0700 Subject: x86/kvm: Remove defunct KVM_DEBUG_FS Kconfig Remove KVM_DEBUG_FS, which can easily be misconstrued as controlling KVM-as-a-host. The sole user of CONFIG_KVM_DEBUG_FS was removed by commit cfd8983f03c7b ("x86, locking/spinlocks: Remove ticket (spin)lock implementation"). 
Signed-off-by: Sean Christopherson Message-Id: <20200528031121.28904-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d6f22a3a1f7d..7e6403a8d861 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include -- cgit From 21998a351512eba4ed5969006f0c55882d995ada Mon Sep 17 00:00:00 2001 From: Anthony Steinhauser Date: Tue, 19 May 2020 06:40:42 -0700 Subject: x86/speculation: Avoid force-disabling IBPB based on STIBP and enhanced IBRS. When STIBP is unavailable or enhanced IBRS is available, Linux force-disables the IBPB mitigation of Spectre-BTB even when simultaneous multithreading is disabled. While attempts to enable IBPB using prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, ...) fail with EPERM, the seccomp syscall (or its prctl(PR_SET_SECCOMP, ...) equivalent) which are used e.g. by Chromium or OpenSSH succeed with no errors but the application remains silently vulnerable to cross-process Spectre v2 attacks (classical BTB poisoning). At the same time the SYSFS reporting (/sys/devices/system/cpu/vulnerabilities/spectre_v2) displays that IBPB is conditionally enabled when in fact it is unconditionally disabled. STIBP is useful only when SMT is enabled. When SMT is disabled and STIBP is unavailable, it makes no sense to force-disable also IBPB, because IBPB protects against cross-process Spectre-BTB attacks regardless of the SMT state. At the same time since missing STIBP was only observed on AMD CPUs, AMD does not recommend using STIBP, but recommends using IBPB, so disabling IBPB because of missing STIBP goes directly against AMD's advice: https://developer.amd.com/wp-content/resources/Architecture_Guidelines_Update_Indirect_Branch_Control.pdf Similarly, enhanced IBRS is designed to protect cross-core BTB poisoning and BTB-poisoning attacks from user space against kernel (and BTB-poisoning attacks from guest against hypervisor), it is not designed to prevent cross-process (or cross-VM) BTB poisoning between processes (or VMs) running on the same core. Therefore, even with enhanced IBRS it is necessary to flush the BTB during context-switches, so there is no reason to force disable IBPB when enhanced IBRS is available. Enable the prctl control of IBPB even when STIBP is unavailable or enhanced IBRS is available. 
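For reference, a small userspace sketch of the control path this fixes, using the documented constants from <linux/prctl.h> (PR_SPEC_DISABLE disables speculation, i.e. requests the mitigation); before this change the set call failed with EPERM on STIBP-less and enhanced-IBRS systems even though IBPB alone would have protected the task:

    #include <sys/prctl.h>
    #include <linux/prctl.h>
    #include <stdio.h>

    int main(void)
    {
            /* Opt this task in: disable indirect branch speculation. */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                      PR_SPEC_DISABLE, 0, 0))
                    perror("PR_SET_SPECULATION_CTRL");

            /* Read back the effective state. */
            printf("ib state: %d\n",
                   prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                         0, 0, 0));
            return 0;
    }
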
Fixes: 7cc765a67d8e ("x86/speculation: Enable prctl mode for spectre_v2_user") Signed-off-by: Anthony Steinhauser Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/kernel/cpu/bugs.c | 87 ++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..8d57562b1d2c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -495,7 +495,9 @@ early_param("nospectre_v1", nospectre_v1_cmdline); static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; -static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = +static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = + SPECTRE_V2_USER_NONE; +static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_RETPOLINE @@ -641,15 +643,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) break; } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); @@ -672,23 +665,36 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); + + spectre_v2_user_ibpb = mode; } - /* If enhanced IBRS is enabled no STIBP required */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + /* + * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * required. + */ + if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* - * If SMT is not possible or STIBP is not available clear the STIBP - * mode. + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if (mode != SPECTRE_V2_USER_STRICT && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + mode = SPECTRE_V2_USER_STRICT_PREFERRED; + + /* + * If STIBP is not available, clear the STIBP mode. */ - if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) + if (!boot_cpu_has(X86_FEATURE_STIBP)) mode = SPECTRE_V2_USER_NONE; + + spectre_v2_user_stibp = mode; + set_mode: - spectre_v2_user = mode; - /* Only print the STIBP mode when SMT possible */ - if (smt_possible) - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { @@ -921,7 +927,7 @@ void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: @@ -1164,14 +1170,16 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; /* * Indirect branch speculation is always disabled in strict * mode. 
*/ - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); @@ -1182,10 +1190,12 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always allowed when * mitigation is force disabled. */ - if (spectre_v2_user == SPECTRE_V2_USER_NONE) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; - if (spectre_v2_user == SPECTRE_V2_USER_STRICT || - spectre_v2_user == SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return 0; task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) @@ -1216,7 +1226,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); - if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif @@ -1247,22 +1258,24 @@ static int ib_prctl_get(struct task_struct *task) if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; - switch (spectre_v2_user) { - case SPECTRE_V2_USER_NONE: + if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && + spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; - case SPECTRE_V2_USER_PRCTL: - case SPECTRE_V2_USER_SECCOMP: + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + return PR_SPEC_DISABLE; + else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || + spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case SPECTRE_V2_USER_STRICT: - case SPECTRE_V2_USER_STRICT_PREFERRED: - return PR_SPEC_DISABLE; - default: + } else return PR_SPEC_NOT_AFFECTED; - } } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) @@ -1501,7 +1514,7 @@ static char *stibp_state(void) if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return ""; - switch (spectre_v2_user) { + switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: -- cgit From dbbe2ad02e9df26e372f38cc3e70dab9222c832e Mon Sep 17 00:00:00 2001 From: Anthony Steinhauser Date: Sun, 5 Jan 2020 12:19:43 -0800 Subject: x86/speculation: Prevent rogue cross-process SSBD shutdown On context switch the change of TIF_SSBD and TIF_SPEC_IB are evaluated to adjust the mitigations accordingly. This is optimized to avoid the expensive MSR write if not needed. This optimization is buggy and allows an attacker to shutdown the SSBD protection of a victim process. The update logic reads the cached base value for the speculation control MSR which has neither the SSBD nor the STIBP bit set. 
It then OR's the SSBD bit only when TIF_SSBD is different and requests the MSR update. That means if TIF_SSBD of the previous and next task are the same, then the base value is not updated, even if TIF_SSBD is set. The MSR write is not requested. Subsequently if the TIF_STIBP bit differs then the STIBP bit is updated in the base value and the MSR is written with a wrong SSBD value. This was introduced when the per task/process conditional STIPB switching was added on top of the existing SSBD switching. It is exploitable if the attacker creates a process which enforces SSBD and has the contrary value of STIBP than the victim process (i.e. if the victim process enforces STIBP, the attacker process must not enforce it; if the victim process does not enforce STIBP, the attacker process must enforce it) and schedule it on the same core as the victim process. If the victim runs after the attacker the victim becomes vulnerable to Spectre V4. To fix this, update the MSR value independent of the TIF_SSBD difference and dependent on the SSBD mitigation method available. This ensures that a subsequent STIPB initiated MSR write has the correct state of SSBD. [ tglx: Handle X86_FEATURE_VIRT_SSBD & X86_FEATURE_VIRT_SSBD correctly and massaged changelog ] Fixes: 5bfbe3ad5840 ("x86/speculation: Prepare for per task indirect branch speculation control") Signed-off-by: Anthony Steinhauser Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/kernel/process.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 35638f1c5791..8f4533c1a4ec 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -545,28 +545,20 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, lockdep_assert_irqs_disabled(); - /* - * If TIF_SSBD is different, select the proper mitigation - * method. Note that if SSBD mitigation is disabled or permanentely - * enabled this branch can't be taken because nothing can set - * TIF_SSBD. - */ - if (tif_diff & _TIF_SSBD) { - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + /* Handle change of TIF_SSBD depending on the mitigation method. */ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); - } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { + if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); - } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - static_cpu_has(X86_FEATURE_AMD_SSBD)) { - msr |= ssbd_tif_to_spec_ctrl(tifn); - updmsr = true; - } + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + updmsr |= !!(tif_diff & _TIF_SSBD); + msr |= ssbd_tif_to_spec_ctrl(tifn); } - /* - * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, - * otherwise avoid the MSR write. - */ + /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); -- cgit From 4d8df8cbb9156b0a0ab3f802b80cb5db57acc0bf Mon Sep 17 00:00:00 2001 From: Anthony Steinhauser Date: Sun, 7 Jun 2020 05:44:19 -0700 Subject: x86/speculation: PR_SPEC_FORCE_DISABLE enforcement for indirect branches. Currently, it is possible to enable indirect branch speculation even after it was force-disabled using the PR_SPEC_FORCE_DISABLE option. 
Moreover, the PR_GET_SPECULATION_CTRL command gives afterwards an incorrect result (force-disabled when it is in fact enabled). This also is inconsistent vs. STIBP and the documention which cleary states that PR_SPEC_FORCE_DISABLE cannot be undone. Fix this by actually enforcing force-disabled indirect branch speculation. PR_SPEC_ENABLE called after PR_SPEC_FORCE_DISABLE now fails with -EPERM as described in the documentation. Fixes: 9137bb27e60e ("x86/speculation: Add prctl() control for indirect branch speculation") Signed-off-by: Anthony Steinhauser Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/kernel/cpu/bugs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8d57562b1d2c..56f573aa764f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1175,11 +1175,14 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) return 0; /* * Indirect branch speculation is always disabled in strict - * mode. + * mode. It can neither be enabled if it was force-disabled + * by a previous prctl call. + */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || - spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) + spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED || + task_spec_ib_force_disable(task)) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); -- cgit From d8ad6d39c35d2b44b3d48b787df7f3359381dcbf Mon Sep 17 00:00:00 2001 From: Bob Haarman Date: Tue, 2 Jun 2020 12:30:59 -0700 Subject: x86_64: Fix jiffies ODR violation 'jiffies' and 'jiffies_64' are meant to alias (two different symbols that share the same address). Most architectures make the symbols alias to the same address via a linker script assignment in their arch//kernel/vmlinux.lds.S: jiffies = jiffies_64; which is effectively a definition of jiffies. jiffies and jiffies_64 are both forward declared for all architectures in include/linux/jiffies.h. jiffies_64 is defined in kernel/time/timer.c. x86_64 was peculiar in that it wasn't doing the above linker script assignment, but rather was: 1. defining jiffies in arch/x86/kernel/time.c instead via the linker script. 2. overriding the symbol jiffies_64 from kernel/time/timer.c in arch/x86/kernel/vmlinux.lds.s via 'jiffies_64 = jiffies;'. As Fangrui notes: In LLD, symbol assignments in linker scripts override definitions in object files. GNU ld appears to have the same behavior. It would probably make sense for LLD to error "duplicate symbol" but GNU ld is unlikely to adopt for compatibility reasons. This results in an ODR violation (UB), which seems to have survived thus far. Where it becomes harmful is when; 1. -fno-semantic-interposition is used: As Fangrui notes: Clang after LLVM commit 5b22bcc2b70d ("[X86][ELF] Prefer to lower MC_GlobalAddress operands to .Lfoo$local") defaults to -fno-semantic-interposition similar semantics which help -fpic/-fPIC code avoid GOT/PLT when the referenced symbol is defined within the same translation unit. Unlike GCC -fno-semantic-interposition, Clang emits such relocations referencing local symbols for non-pic code as well. This causes references to jiffies to refer to '.Ljiffies$local' when jiffies is defined in the same translation unit. Likewise, references to jiffies_64 become references to '.Ljiffies_64$local' in translation units that define jiffies_64. 
Because these differ from the names used in the linker script, they will not be rewritten to alias one another. 2. Full LTO Full LTO effectively treats all source files as one translation unit, causing these local references to be produced everywhere. When the linker processes the linker script, there are no longer any references to jiffies_64' anywhere to replace with 'jiffies'. And thus '.Ljiffies$local' and '.Ljiffies_64$local' no longer alias at all. In the process of porting patches enabling Full LTO from arm64 to x86_64, spooky bugs have been observed where the kernel appeared to boot, but init doesn't get scheduled. Avoid the ODR violation by matching other architectures and define jiffies only by linker script. For -fno-semantic-interposition + Full LTO, there is no longer a global definition of jiffies for the compiler to produce a local symbol which the linker script won't ensure aliases to jiffies_64. Fixes: 40747ffa5aa8 ("asmlinkage: Make jiffies visible") Reported-by: Nathan Chancellor Reported-by: Alistair Delva Debugged-by: Nick Desaulniers Debugged-by: Sami Tolvanen Suggested-by: Fangrui Song Signed-off-by: Bob Haarman Signed-off-by: Thomas Gleixner Tested-by: Sedat Dilek # build+boot on Reviewed-by: Andi Kleen Reviewed-by: Josh Poimboeuf Cc: stable@vger.kernel.org Link: https://github.com/ClangBuiltLinux/linux/issues/852 Link: https://lkml.kernel.org/r/20200602193100.229287-1-inglorion@google.com --- arch/x86/kernel/time.c | 4 ---- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 106e7f87f534..f39572982635 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -25,10 +25,6 @@ #include #include -#ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; -#endif - unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1bf7e312361f..7c35556c7827 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,13 +40,13 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) ENTRY(phys_startup_32) -jiffies = jiffies_64; #else OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) -jiffies_64 = jiffies; #endif +jiffies = jiffies_64; + #if defined(CONFIG_X86_64) /* * On 64-bit, align RODATA to 2MB so we retain large page mappings for -- cgit From d46b3df78ad4b4c178f1035a35463cbc0ce768b2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:57 -0700 Subject: x86: add missing const qualifiers for log_lvl Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. 
Keep log_lvl const show_trace_log_lvl() and printk_stack_address() as the new generic show_stack_loglvl() wants to have a proper const qualifier. And gcc rightfully produces warnings in case it's not keept: arch/x86/kernel/dumpstack.c: In function `show_stack': arch/x86/kernel/dumpstack.c:294:37: warning: passing argument 4 of `show_trace_log_lv ' discards `const' qualifier from pointer target type [-Wdiscarded-qualifiers] 294 | show_trace_log_lvl(task, NULL, sp, loglvl); | ^~~~~~ arch/x86/kernel/dumpstack.c:163:32: note: expected `char *' but argument is of type `const char *' 163 | unsigned long *stack, char *log_lvl) | ~~~~~~^~~~~~~ [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-41-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/dumpstack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae64ec7f752f..b94bc31a1757 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info) } static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) + const char *log_lvl) { touch_nmi_watchdog(); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); @@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; -- cgit From a832ff02244e36da0bf4bb3a1aec0ce9a23b0bad Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:00 -0700 Subject: x86: add show_stack_loglvl() Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with lower log level or higher than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from user to an architecture side. In result, some users as sysrq/kdb/etc are doing tricks with temporary rising console_loglevel while printing their messages. And in result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Introduce show_stack_loglvl(), that eventually will substitute show_stack(). [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-42-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/dumpstack.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b94bc31a1757..4396f2cfad19 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,8 @@ next: } } -void show_stack(struct task_struct *task, unsigned long *sp) +void show_stack_loglvl(struct task_struct *task, unsigned long *sp, + const char *loglvl) { task = task ? : current; @@ -290,7 +291,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); + show_trace_log_lvl(task, NULL, sp, loglvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_loglvl(task, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) -- cgit From 9ed5b01a36a0e40a7450b8a5caf82e0552c41bb3 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:13 -0700 Subject: x86/amd_gart: print stacktrace for a leak with KERN_ERR It's under CONFIG_IOMMU_LEAK option which is enabled by debug config. Likely the backtrace is worth to be seen - so aligning with log level of error message in iommu_full(). Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-46-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 16133819415c..9d2c076be37a 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack(NULL, NULL); + show_stack_loglvl(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif -- cgit From 9cb8f069deeed708bf19486d5893e297dc467ae0 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:32:29 -0700 Subject: kernel: rename show_stack_loglvl() => show_stack() Now the last users of show_stack() got converted to use an explicit log level, show_stack_loglvl() can drop it's redundant suffix and become once again well known show_stack(). 
Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20200418201944.482088-51-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_gart_64.c | 2 +- arch/x86/kernel/dumpstack.c | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 9d2c076be37a..5f816861f5d2 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -159,7 +159,7 @@ static void dump_leak(void) return; dump = 1; - show_stack_loglvl(NULL, NULL, KERN_ERR); + show_stack(NULL, NULL, KERN_ERR); debug_dma_dump_mappings(NULL); } #endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 4396f2cfad19..456511b2284e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,7 @@ next: } } -void show_stack_loglvl(struct task_struct *task, unsigned long *sp, +void show_stack(struct task_struct *task, unsigned long *sp, const char *loglvl) { task = task ? : current; @@ -294,11 +294,6 @@ void show_stack_loglvl(struct task_struct *task, unsigned long *sp, show_trace_log_lvl(task, NULL, sp, loglvl); } -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_loglvl(task, sp, KERN_DEFAULT); -} - void show_stack_regs(struct pt_regs *regs) { show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); -- cgit From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - 10 files changed, 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index cd617979b7fc..a9195ce8265d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 5f816861f5d2..17cb5b933dcf 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 3793646f0fb5..2ccc57f152a4 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 02bddfc122a4..64b00b0d7fe8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ad5cdd6a5f23..a29a44a98e5b 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 23c95a53d20e..34b153cbd4ac 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 538d4e8d6589..acfd6d2a0cbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -39,7 +39,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0c169a5687e1..9a97415b2139 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f0e1ddbc2fd7..44130588987f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -28,7 +28,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 
b2942b2dbfcf..cbc0c82e55b7 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include -- cgit From ca5999fde0a1761665a38e4c9a72dbcd7d190a81 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:38 -0700 Subject: mm: introduce include/linux/pgtable.h The include/linux/pgtable.h is going to be the home of generic page table manipulation functions. Start with moving asm-generic/pgtable.h to include/linux/pgtable.h and make the latter include asm/pgtable.h. Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-3-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 2 +- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/crash_core_32.c | 2 +- arch/x86/kernel/crash_core_64.c | 2 +- arch/x86/kernel/early_printk.c | 2 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 683ed9e12e6b..2f04f4505166 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ed3b04483972..c95a630f266e 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index cdf45b4700f2..5a58c85c22c7 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include "local.h" diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..f75a0cca1ab7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 74682b8d09b0..efdeaf21aa5f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 
166d7c355896..1c00a443d6b9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c index c0159a7bca6d..dfa6ed2156fd 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/crash_core_32.c @@ -2,7 +2,7 @@ #include -#include +#include #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c index 845a57eb4eb7..15ddebde8741 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/crash_core_64.c @@ -2,7 +2,7 @@ #include -#include +#include #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 93fbdff2974f..dee2ea5223d8 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 12e7d4406c32..4fe7af58cfe1 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 206a4b6144c2..b01373e7db7c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4bbc770af632..a4147d80c9c5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 519649ddf100..33d56b7b8a4b 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 5aa523c2d573..6375df15e763 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4d7022a740ab..e939c606872f 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -45,7 +45,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index ea13f6888284..184402c0a3b1 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5638e4ae2ea6..90a94876c50f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 3ca43be4f9cf..8bfeb4213b4a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2467f3dd35d3..48caa588fd53 100644 --- 
a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -63,7 +63,7 @@ #include #include #include -#include <asm/pgtable.h> +#include <linux/pgtable.h> #include #include #include -- cgit From 65fddcfca8ad14778f71a57672fd01e8112d30fa Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:42 -0700 Subject: mm: reorder includes after introduction of linux/pgtable.h The replacement of <asm/pgtable.h> with <linux/pgtable.h> made the include of the latter land in the middle of the asm includes. Fix this up with the aid of the below script and manual adjustments here and there. import sys import re if len(sys.argv) is not 3: print "USAGE: %s
" % (sys.argv[0]) sys.exit(1) hdr_to_move="#include " % sys.argv[2] moved = False in_hdrs = False with open(sys.argv[1], "r") as f: lines = f.readlines() for _line in lines: line = _line.rstrip(' ') if line == hdr_to_move: continue if line.startswith("#include Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-4-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 2 +- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/crash_core_32.c | 2 +- arch/x86/kernel/crash_core_64.c | 2 +- arch/x86/kernel/early_printk.c | 2 +- arch/x86/kernel/head64.c | 2 +- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/i8259.c | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/kernel/kprobes/opt.c | 2 +- arch/x86/kernel/paravirt.c | 2 +- arch/x86/kernel/reboot.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 2f04f4505166..7bdc0239a943 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -20,11 +20,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index c95a630f266e..cc1fea76aab0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,9 +10,9 @@ #include #include #include +#include #include #include -#include #include #include diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 5a58c85c22c7..35edd57f064a 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -12,11 +12,11 @@ */ #include #include +#include #include #include -#include #include "local.h" diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index f75a0cca1ab7..c901b086bbdb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index efdeaf21aa5f..b367df8f54e7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1c00a443d6b9..63926c94eb5f 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include 
@@ -11,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c index dfa6ed2156fd..8a89c109e20a 100644 --- a/arch/x86/kernel/crash_core_32.c +++ b/arch/x86/kernel/crash_core_32.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include - #include + #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c index 15ddebde8741..7d255f882afe 100644 --- a/arch/x86/kernel/crash_core_64.c +++ b/arch/x86/kernel/crash_core_64.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only #include - #include + #include void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index dee2ea5223d8..d3c531d3b244 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b01373e7db7c..cbb71c1b574f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -20,13 +20,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a4147d80c9c5..4fc33fdf0f16 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 33d56b7b8a4b..f3c76252247d 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,11 +15,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6375df15e763..dd73135d7cee 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,11 +16,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index e939c606872f..85de8fa69b24 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -41,11 +41,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 184402c0a3b1..234f58e0fe8c 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -16,11 +16,11 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 90a94876c50f..674a7d66d960 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -13,13 +13,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 8bfeb4213b4a..e040ba6be27b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -11,13 +11,13 @@ #include #include #include +#include #include #include #include #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 
48caa588fd53..ffbd9a3d78d8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -63,7 +64,6 @@ #include #include #include -#include #include #include #include -- cgit From d8ed45c5dcd455fc5848d47f86883a1b872ac0d0 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:25 -0700 Subject: mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 47a8676c7395..764573de3996 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -171,7 +171,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_t *pte; int i; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -197,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) } pte_unmap_unlock(pte, ptl); out: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false); } -- cgit From 14c3656b7284a8649496584869e8c6642ec1abbb Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:40 -0700 Subject: mmap locking API: add MMAP_LOCK_INITIALIZER Define a new initializer for the mmap locking api. Initially this just evaluates to __RWSEM_INITIALIZER as the API is defined as wrappers around rwsem. 
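[ ed: The macro body itself is not part of this hunk. A minimal sketch of
  what the description implies, assuming the mm_struct field is still
  named mmap_sem at this point in the series:

	/* include/linux/mmap_lock.h, sketch, not quoted from the patch */
	#define MMAP_LOCK_INITIALIZER(name) \
		.mmap_sem = __RWSEM_INITIALIZER((name).mmap_sem),

  With that expansion, the tboot_mm initializer below keeps its previous
  meaning exactly while hiding the underlying rwsem behind the API. ]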
Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Laurent Dufour Reviewed-by: Vlastimil Babka Reviewed-by: Daniel Jordan Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-9-walken@google.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index cbc0c82e55b7..992fb1415c0f 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,7 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), - .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), }; -- cgit From c1e8d7c6a7a682e1405e3e242d32fc377fd196ff Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jun 2020 21:33:54 -0700 Subject: mmap locking API: convert mmap_sem comments Convert comments that reference mmap_sem to reference mmap_lock instead. [akpm@linux-foundation.org: fix up linux-next leftovers] [akpm@linux-foundation.org: s/lockaphore/lock/, per Vlastimil] [akpm@linux-foundation.org: more linux-next fixups, per Michel] Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Reviewed-by: Vlastimil Babka Reviewed-by: Daniel Jordan Cc: Davidlohr Bueso Cc: David Rientjes Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Laurent Dufour Cc: Liam Howlett Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Ying Han Link: http://lkml.kernel.org/r/20200520052908.204642-13-walken@google.com Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 6 +++--- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 6 +++--- arch/x86/kernel/ldt.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 4bd28b388a1a..0daf2f1cf7a8 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1326,9 +1326,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) * pseudo-locked region will still be here on return. * * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_sem semaphore which is obtained in - * the device_create() and debugfs_create_dir() callpath below - * as well as before the mmap() callback is called. + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. */ mutex_unlock(&rdtgroup_mutex); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index d7cb5ab0d1f0..23b4b61319d3 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -3199,10 +3199,10 @@ int __init rdtgroup_init(void) * during the debugfs directory creation also &sb->s_type->i_mutex_key * (the lockdep class of inode->i_rwsem). Other filesystem * interactions (eg. 
SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_sem - * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex * is taken, thus creating dependency: - * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause * issues considering the other two lock dependencies. * By creating the debugfs directory here we avoid a dependency * that may cause deadlock (even though file operations cannot diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 84c3ba32f211..8748321c4486 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -8,7 +8,7 @@ * * Lock order: * contex.ldt_usr_sem - * mmap_sem + * mmap_lock * context.lock */ -- cgit From fbaed278a3cc72a46aadae667b8c6754b78640a6 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 19 Apr 2020 14:40:48 +0000 Subject: x86/idt: Remove address operator on function machine_check() machine_check is function address, the address operator on it is nop for compiler. Make it consistent with the other function addresses in the same file. Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200419144049.1906-3-laijs@linux.alibaba.com --- arch/x86/kernel/idt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87ef69a72c52..98bcb502f967 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -93,7 +93,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DB, debug), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, &machine_check), + INTG(X86_TRAP_MC, machine_check), #endif SYSG(X86_TRAP_OF, overflow), @@ -186,7 +186,7 @@ static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, machine_check, IST_INDEX_MCE), #endif }; -- cgit From 06184325a1cce27a02a688d98740f90fe06e0133 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 28 Apr 2020 11:38:23 +0200 Subject: x86/idt: Annotate alloc_intr_gate() with __init There seems to be no reason to allocate interrupt gates after init. Mark alloc_intr_gate() as __init and add WARN_ON() checks making sure it is only used before idt_setup_apic_and_irq_gates() finalizes IDT setup and maps all un-allocated entries to spurious entries. Suggested-by: Thomas Gleixner Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200428093824.1451532-3-vkuznets@redhat.com --- arch/x86/kernel/idt.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 98bcb502f967..0e9205137de8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -51,6 +51,9 @@ struct idt_data { #define TSKG(_vector, _gdt) \ G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3) + +static bool idt_setup_done __initdata; + /* * Early traps running on the DEFAULT_STACK because the other interrupt * stacks work only after cpu_init(). 
@@ -323,6 +326,7 @@ void __init idt_setup_apic_and_irq_gates(void) set_intr_gate(i, entry); } #endif + idt_setup_done = true; } /** @@ -352,6 +356,7 @@ void idt_invalidate(void *addr) load_idt(&idt); } +/* This goes away once ASYNC_PF is sanitized */ void __init update_intr_gate(unsigned int n, const void *addr) { if (WARN_ON_ONCE(!test_bit(n, system_vectors))) @@ -359,9 +364,14 @@ void __init update_intr_gate(unsigned int n, const void *addr) set_intr_gate(n, addr); } -void alloc_intr_gate(unsigned int n, const void *addr) +void __init alloc_intr_gate(unsigned int n, const void *addr) { - BUG_ON(n < FIRST_SYSTEM_VECTOR); - if (!test_and_set_bit(n, system_vectors)) + if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) + return; + + if (WARN_ON(idt_setup_done)) + return; + + if (!WARN_ON(test_and_set_bit(n, system_vectors))) set_intr_gate(n, addr); } -- cgit From 1f1fbc70c10e81f70e9fbe2102d439c883269811 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 28 Apr 2020 11:38:24 +0200 Subject: x86/idt: Keep spurious entries unset in system_vectors With commit dc20b2d52653 ("x86/idt: Move interrupt gate initialization to IDT code") non assigned system vectors are also marked as used in 'used_vectors' (now 'system_vectors') bitmap. This makes checks in arch_show_interrupts() whether a particular system vector is allocated to always pass and e.g. 'Hyper-V reenlightenment interrupts' entry always shows up in /proc/interrupts. Another side effect of having all unassigned system vectors marked as used is that irq_matrix_debug_show() will wrongly count them among 'System' vectors. As it is now ensured that alloc_intr_gate() is not called after init, it is possible to leave unused entries in 'system_vectors' unset to fix these issues. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200428093824.1451532-4-vkuznets@redhat.com --- arch/x86/kernel/idt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0e9205137de8..36fef90a38e7 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -321,7 +321,11 @@ void __init idt_setup_apic_and_irq_gates(void) #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { - set_bit(i, system_vectors); + /* + * Don't set the non assigned system vectors in the + * system_vectors bitmap. Otherwise they show up in + * /proc/interrupts. + */ entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); set_intr_gate(i, entry); } -- cgit From 24ae0c91cbc57c2deb0401bd653453a508acdcee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 24 Feb 2020 13:24:58 +0100 Subject: x86/hw_breakpoint: Prevent data breakpoints on cpu_entry_area A data breakpoint near the top of an IST stack will cause unrecoverable recursion. A data breakpoint on the GDT, IDT, or TSS is terrifying. Prevent either of these from happening. 
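[ ed: The "bp_end < attr->bp_addr" guard added below also closes an
  integer-wraparound hole. A worked example with hypothetical attribute
  values, not taken from the patch:

	struct perf_event_attr attr = {
		.bp_addr = 0xfffffffffffffff8ULL, /* near the top of the address space */
		.bp_len  = 0x10,                  /* addr + len - 1 wraps past zero */
	};
	/*
	 * bp_end = 0xfffffffffffffff8 + 0x10 - 1 wraps to 0x7, so
	 * bp_end < attr->bp_addr and the request is rejected with
	 * -EINVAL before the cpu_entry_area overlap check even runs.
	 */

  Without the guard, the wrapped range would look tiny and could slip
  past a naive overlap test. ]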
Co-developed-by: Peter Zijlstra Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134058.272448010@linutronix.de --- arch/x86/kernel/hw_breakpoint.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d8d53ed02c9..d42fc0eaf193 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -227,10 +227,35 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw) return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX); } +/* + * Checks whether the range from addr to end, inclusive, overlaps the CPU + * entry area range. + */ +static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end) +{ + return end >= CPU_ENTRY_AREA_BASE && + addr < (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_TOTAL_SIZE); +} + static int arch_build_bp_info(struct perf_event *bp, const struct perf_event_attr *attr, struct arch_hw_breakpoint *hw) { + unsigned long bp_end; + + bp_end = attr->bp_addr + attr->bp_len - 1; + if (bp_end < attr->bp_addr) + return -EINVAL; + + /* + * Prevent any breakpoint of any type that overlaps the + * cpu_entry_area. This protects the IST stacks and also + * reduces the chance that we ever find out what happens if + * there's a data breakpoint on the GDT, IDT, or TSS. + */ + if (within_cpu_entry_area(attr->bp_addr, bp_end)) + return -EINVAL; + hw->address = attr->bp_addr; hw->mask = 0; -- cgit From e9660391d0ebd174b169af3d6de680c2f836027c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Feb 2020 13:17:27 +0100 Subject: x86/doublefault: Remove memmove() call Use of memmove() in #DF is problematic considered tracing and other instrumentation. Remove the memmove() call and simply write out what needs doing; this even clarifies the code, win-win! The code copies from the espfix64 stack to the normal task stack, there is no possible way for that to overlap. Survives selftests/x86, specifically sigreturn_64. Suggested-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134058.863038566@linutronix.de --- arch/x86/kernel/traps.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4cc541051994..48468f61202c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -299,6 +299,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign regs->ip == (unsigned long)native_irq_return_iret) { struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + unsigned long *p = (unsigned long *)regs->sp; /* * regs->sp points to the failing IRET frame on the @@ -306,7 +307,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * in gpregs->ss through gpregs->ip. 
* */ - memmove(&gpregs->ip, (void *)regs->sp, 5*8); + gpregs->ip = p[0]; + gpregs->cs = p[1]; + gpregs->flags = p[2]; + gpregs->sp = p[3]; + gpregs->ss = p[4]; gpregs->orig_ax = 0; /* Missing (lost) #GP error code */ /* -- cgit From fba8dbeaf30e2c8db2c2cfeb38f6dbffcbf86bba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 May 2020 17:39:05 +0200 Subject: x86/idt: Remove update_intr_gate() No more users. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/idt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 36fef90a38e7..95609ee4c8b3 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -360,14 +360,6 @@ void idt_invalidate(void *addr) load_idt(&idt); } -/* This goes away once ASYNC_PF is sanitized */ -void __init update_intr_gate(unsigned int n, const void *addr) -{ - if (WARN_ON_ONCE(!test_bit(n, system_vectors))) - return; - set_intr_gate(n, addr); -} - void __init alloc_intr_gate(unsigned int n, const void *addr) { if (WARN_ON(n < FIRST_SYSTEM_VECTOR)) -- cgit From b9f6976bfb949121bb6e1e6f4fd9909735729148 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 19:45:26 +0100 Subject: x86/entry/64: Move non entry code into .text section All ASM code which is not part of the entry functionality can move out into the .text section. No reason to keep it in the non-instrumentable entry section. Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134340.227579223@linutronix.de --- arch/x86/kernel/ftrace_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index aa5d28aeb31e..083a3da7bb73 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -12,7 +12,7 @@ #include .code64 - .section .entry.text, "ax" + .section .text, "ax" #ifdef CONFIG_FRAME_POINTER /* Save parent and function stack frames (rip and rbp) */ -- cgit From d73a332936a6d33be3aa3fa4bee959efab09e431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 19:53:38 +0100 Subject: x86/traps: Mark fixup_bad_iret() noinstr This is called from deep entry ASM in a situation where instrumentation will cause more harm than providing useful information. Switch from memmove() to memcpy() because memmove() can't be called from noinstr code. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Masami Hiramatsu Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.346741553@linutronix.de --- arch/x86/kernel/traps.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 48468f61202c..b2b36561a569 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -578,7 +578,7 @@ struct bad_iret_stack { struct pt_regs regs; }; -asmlinkage __visible notrace +asmlinkage __visible noinstr struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) { /* @@ -589,19 +589,21 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. 
*/ - struct bad_iret_stack *new_stack = - (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + struct bad_iret_stack tmp, *new_stack = + (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; - /* Copy the IRET target to the new stack. */ - memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8); + /* Copy the IRET target to the temporary storage. */ + memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); + memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + + /* Update the entry stack */ + memcpy(new_stack, &tmp, sizeof(tmp)); BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } -NOKPROBE_SYMBOL(fixup_bad_iret); #endif static bool is_sysenter_singlestep(struct pt_regs *regs) -- cgit From daf7a69787b587d454adea73377a904e09fd54a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2020 23:47:51 +0100 Subject: x86/traps: Mark sync_regs() noinstr Replace the notrace and NOKPROBE annotations with noinstr. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Reviewed-by: Masami Hiramatsu Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.439765290@linutronix.de --- arch/x86/kernel/traps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b2b36561a569..adcc62380ece 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -564,14 +564,13 @@ NOKPROBE_SYMBOL(do_int3); * to switch to the normal thread stack if the interrupted code was in * user mode. The actual stack switch is done in entry_64.S */ -asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1; if (regs != eregs) *regs = *eregs; return regs; } -NOKPROBE_SYMBOL(sync_regs); struct bad_iret_stack { void *error_entry_ret; -- cgit From ca4c6a9858c23b4f330113f391f2eadc983e780f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Oct 2019 14:27:10 +0200 Subject: x86/traps: Make interrupt enable/disable symmetric in C code Traps enable interrupts conditionally but rely on the ASM return code to disable them again. That results in redundant interrupt disable and trace calls. Make the trap handlers disable interrupts before returning to avoid that, which allows simplification of the ASM entry code in follow up changes. 
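[ ed: The hunks below lean on the cond_local_irq_*() helpers that already
  live in arch/x86/kernel/traps.c. Roughly (a sketch, with exact storage
  modifiers elided), they only touch the interrupt flag if the
  interrupted context had interrupts enabled:

	static void cond_local_irq_enable(struct pt_regs *regs)
	{
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	static void cond_local_irq_disable(struct pt_regs *regs)
	{
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_disable();
	}

  That is why a handler can call cond_local_irq_disable() unconditionally
  on its exit path: it is a no-op whenever the matching enable was. ]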
Originally-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134903.622702796@linutronix.de --- arch/x86/kernel/traps.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index adcc62380ece..f5f4a76fb516 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -201,6 +201,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, NOTIFY_STOP) { cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, sicode, addr); + cond_local_irq_disable(regs); } } @@ -397,6 +398,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) die("bounds", regs, error_code); do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + + cond_local_irq_disable(regs); } enum kernel_gp_hint { @@ -456,12 +459,13 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) if (static_cpu_has(X86_FEATURE_UMIP)) { if (user_mode(regs) && fixup_umip_exception(regs)) - return; + goto exit; } if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + local_irq_disable(); return; } @@ -473,12 +477,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) show_signal(tsk, SIGSEGV, "", desc, regs, error_code); force_sig(SIGSEGV); - - return; + goto exit; } if (fixup_exception(regs, X86_TRAP_GP, error_code, 0)) - return; + goto exit; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -490,11 +493,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) if (!preemptible() && kprobe_running() && kprobe_fault_handler(regs, X86_TRAP_GP)) - return; + goto exit; ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV); if (ret == NOTIFY_STOP) - return; + goto exit; if (error_code) snprintf(desc, sizeof(desc), "segment-related " GPFSTR); @@ -516,6 +519,8 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) die_addr(desc, regs, error_code, gp_addr); +exit: + cond_local_irq_disable(regs); } NOKPROBE_SYMBOL(do_general_protection); @@ -773,7 +778,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (!user_mode(regs)) { if (fixup_exception(regs, trapnr, error_code, 0)) - return; + goto exit; task->thread.error_code = error_code; task->thread.trap_nr = trapnr; @@ -781,7 +786,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, error_code); - return; + goto exit; } /* @@ -795,10 +800,12 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ if (!si_code) - return; + goto exit; force_sig_fault(SIGFPE, si_code, (void __user *)uprobe_get_trap_addr(regs)); +exit: + cond_local_irq_disable(regs); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) @@ -853,6 +860,8 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + + cond_local_irq_disable(regs); return; } #endif @@ -883,6 +892,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, SIGILL, "iret 
exception", regs, error_code, ILL_BADSTK, (void __user *)NULL); } + local_irq_disable(); } #endif -- cgit From 218e31b6e7a33c9b5e5d608aa79d51665bb84e62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:13 +0100 Subject: x86/traps: Prepare for using DEFINE_IDTENTRY Prepare for using IDTENTRY to define the C exception/trap entry points. It would be possible to glue this into the existing macro maze, but it's simpler and better to read at the end to just make them distinct. Provide a trivial inline helper to read the trap address and add a comment explaining the logic behind it. The existing macros will be removed once all instances are converted. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.556327833@linutronix.de --- arch/x86/kernel/traps.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f5f4a76fb516..3857c0fd3306 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -205,6 +205,21 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, } } +/* + * Posix requires to provide the address of the faulting instruction for + * SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t. + * + * This address is usually regs->ip, but when an uprobe moved the code out + * of line then regs->ip points to the XOL code which would confuse + * anything which analyzes the fault address vs. the unmodified binary. If + * a trap happened in XOL code then uprobe maps regs->ip back to the + * original instruction address. + */ +static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) +{ + return (void __user *)uprobe_get_trap_addr(regs); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -- cgit From 9d06c4027f21fcfa60221bd7203eda3c82568467 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:14 +0100 Subject: x86/entry: Convert Divide Error to IDTENTRY Convert #DE to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134904.663914713@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 95609ee4c8b3..f2a751b10a01 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -73,7 +73,7 @@ static const __initconst struct idt_data early_idts[] = { * set up TSS. 
*/ static const __initconst struct idt_data def_idts[] = { - INTG(X86_TRAP_DE, divide_error), + INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, bounds), INTG(X86_TRAP_UD, invalid_op), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3857c0fd3306..37092f74ec42 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -220,6 +220,12 @@ static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs) return (void __user *)uprobe_get_trap_addr(regs); } +DEFINE_IDTENTRY(exc_divide_error) +{ + do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE, + FPE_INTDIV, error_get_trap_addr(regs)); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -227,7 +233,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) -- cgit From 4b6b9111c0b9aa4c3b319f1c5a3b1d5850792167 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:15 +0100 Subject: x86/entry: Convert Overflow exception to IDTENTRY Convert #OF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
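[ ed: The DEFINE_IDTENTRY()/DECLARE_IDTENTRY() pair driving this run of
  conversions is added elsewhere in the series, in
  arch/x86/include/asm/idtentry.h. A simplified sketch of the expansion,
  with the entry/exit bookkeeping reduced to its essentials:

	/* Sketch only; the real macro carries more annotations. */
	#define DEFINE_IDTENTRY(func)					\
	static __always_inline void __##func(struct pt_regs *regs);	\
									\
	__visible noinstr void func(struct pt_regs *regs)		\
	{								\
		idtentry_enter(regs);					\
		instrumentation_begin();				\
		__##func(regs);						\
		instrumentation_end();					\
		idtentry_exit(regs);					\
	}								\
									\
	static __always_inline void __##func(struct pt_regs *regs)

  DECLARE_IDTENTRY() emits the matching C prototype plus the asm_exc_*
  stub that the IDT tables in these hunks now point at. ]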
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.771457898@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f2a751b10a01..f8d79629535e 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -99,7 +99,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_MC, machine_check), #endif - SYSG(X86_TRAP_OF, overflow), + SYSG(X86_TRAP_OF, asm_exc_overflow), #if defined(CONFIG_IA32_EMULATION) SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat), #elif defined(CONFIG_X86_32) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 37092f74ec42..b522e2aa9e9a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -226,6 +226,11 @@ DEFINE_IDTENTRY(exc_divide_error) FPE_INTDIV, error_get_trap_addr(regs)); } +DEFINE_IDTENTRY(exc_overflow) +{ + do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -233,7 +238,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow) DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) -- cgit From 58d9c81facf55dbd1836d114ce360a048e3a0582 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:17 +0100 Subject: x86/entry: Convert Bounds exception to IDTENTRY Convert #BR to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.863001309@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f8d79629535e..87583b69cbc2 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -75,7 +75,7 @@ static const __initconst struct idt_data early_idts[] = { static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), - INTG(X86_TRAP_BR, bounds), + INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, invalid_op), INTG(X86_TRAP_NM, device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b522e2aa9e9a..7a9fb8b9e1a8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -410,18 +410,17 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign panic("Machine halted."); } -dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_bounds) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, "bounds", regs, error_code, + if (notify_die(DIE_TRAP, "bounds", regs, 0, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; cond_local_irq_enable(regs); if (!user_mode(regs)) - die("bounds", regs, error_code); + die("bounds", regs, 0); - do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL); cond_local_irq_disable(regs); } -- cgit From 49893c5cb281f8691dcbe53e6f85a963f47a4b9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:18 +0100 Subject: x86/entry: Convert Invalid Opcode exception to IDTENTRY Convert #UD to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Fixup the FOOF bug call in fault.c - Remove the old prototypes No functional change. 
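[ ed: The fault.c fixup mentioned above is not included in this digest.
  The #ifdef split on handle_invalid_op() in the traps.c hunk below
  exists so the Pentium F00F workaround keeps a callable C entry point
  after the old do_invalid_op() is gone. A sketch of the plausible call
  site; the condition is paraphrased, not quoted from any patch:

	/* arch/x86/mm/fault.c, F00F erratum workaround (sketch) */
	if (boot_cpu_has_bug(X86_BUG_F00F) && fault_hit_the_idt) {
		/* Re-report the IDT read fault as an invalid opcode. */
		handle_invalid_op(regs);
		return 1;
	}

  Here fault_hit_the_idt stands in for the real address check. ]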
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134904.955511913@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 87583b69cbc2..8b48f54aeff6 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -76,7 +76,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, asm_exc_bounds), - INTG(X86_TRAP_UD, invalid_op), + INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7a9fb8b9e1a8..71ac43dc036a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -231,6 +231,21 @@ DEFINE_IDTENTRY(exc_overflow) do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL); } +#ifdef CONFIG_X86_F00F_BUG +void handle_invalid_op(struct pt_regs *regs) +#else +static inline void handle_invalid_op(struct pt_regs *regs) +#endif +{ + do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL, + ILL_ILLOPN, error_get_trap_addr(regs)); +} + +DEFINE_IDTENTRY(exc_invalid_op) +{ + handle_invalid_op(regs); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -238,7 +253,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) -- cgit From 866ae2ccee4ac092fea14f18d537205e14c5a904 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:19 +0100 Subject: x86/entry: Convert Device not available exception to IDTENTRY Convert #NM to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.056243863@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8b48f54aeff6..cdc2d8bbd338 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -77,7 +77,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NMI, nmi), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), - INTG(X86_TRAP_NM, device_not_available), + INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS), INTG(X86_TRAP_NP, segment_not_present), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 71ac43dc036a..b8af5eb6a929 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -882,13 +882,10 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) */ } -dotraplinkage void -do_device_not_available(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_device_not_available) { unsigned long cr0 = read_cr0(); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) { struct math_emu_info info = { }; @@ -913,10 +910,9 @@ do_device_not_available(struct pt_regs *regs, long error_code) * to kill the task than getting stuck in a never-ending * loop of #NM faults. */ - die("unexpected #NM exception", regs, error_code); + die("unexpected #NM exception", regs, 0); } } -NOKPROBE_SYMBOL(do_device_not_available); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) -- cgit From f95658fdb575233f79e3e7ed7ecf990389d31319 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:20 +0100 Subject: x86/entry: Convert Coprocessor segment overrun exception to IDTENTRY Convert #OLD_MF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.838823510@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index cdc2d8bbd338..758d325103e8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -78,7 +78,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), - INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun), + INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, invalid_TSS), INTG(X86_TRAP_NP, segment_not_present), INTG(X86_TRAP_SS, stack_segment), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b8af5eb6a929..3ce1f667d078 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -246,6 +246,12 @@ DEFINE_IDTENTRY(exc_invalid_op) handle_invalid_op(regs); } +DEFINE_IDTENTRY(exc_coproc_segment_overrun) +{ + do_error_trap(regs, 0, "coprocessor segment overrun", + X86_TRAP_OLD_MF, SIGFPE, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -253,7 +259,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -- cgit From 97b3d290b865cf9115f7d37d40b7482efba4d46d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:22 +0100 Subject: x86/entry: Convert Invalid TSS exception to IDTENTRY Convert #TS to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
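[ ed: #TS is the first conversion in this run to use the error-code
  flavor. A simplified sketch of how it is assumed to differ from plain
  DEFINE_IDTENTRY(), namely by threading through one extra argument:

	#define DEFINE_IDTENTRY_ERRORCODE(func)				\
	static __always_inline void __##func(struct pt_regs *regs,	\
					     unsigned long error_code);	\
									\
	__visible noinstr void func(struct pt_regs *regs,		\
				    unsigned long error_code)		\
	{								\
		idtentry_enter(regs);					\
		instrumentation_begin();				\
		__##func(regs, error_code);				\
		instrumentation_end();					\
		idtentry_exit(regs);					\
	}								\
									\
	static __always_inline void __##func(struct pt_regs *regs,	\
					     unsigned long error_code)

  The hardware-pushed error code therefore reaches the C body directly,
  which is why exc_invalid_tss() below can hand error_code straight to
  do_error_trap(). ]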
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.350676449@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 758d325103e8..caa740df1404 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -79,7 +79,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), - INTG(X86_TRAP_TS, invalid_TSS), + INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, segment_not_present), INTG(X86_TRAP_SS, stack_segment), INTG(X86_TRAP_GP, general_protection), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3ce1f667d078..10ab0837668f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -252,6 +252,12 @@ DEFINE_IDTENTRY(exc_coproc_segment_overrun) X86_TRAP_OLD_MF, SIGFPE, 0, NULL); } +DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) +{ + do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV, + 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -259,7 +265,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS) DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) #undef IP -- cgit From 99a3fb8d01af1085f16a417a748e0a462dc92d29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:23 +0100 Subject: x86/entry: Convert Segment not present exception to IDTENTRY Convert #NP to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134905.443591450@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index caa740df1404..b9acc7f5684a 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -80,7 +80,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NM, asm_exc_device_not_available), INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, asm_exc_invalid_tss), - INTG(X86_TRAP_NP, segment_not_present), + INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, stack_segment), INTG(X86_TRAP_GP, general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 10ab0837668f..88ba5f0400fd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -258,6 +258,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss) 0, NULL); } +DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) +{ + do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP, + SIGBUS, 0, NULL); +} + #define IP ((void __user *)uprobe_get_trap_addr(regs)) #define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ @@ -265,7 +271,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ } -DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) #undef IP -- cgit From fd9689bf91131c4bea5ea54f828af5267f5ed6a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:24 +0100 Subject: x86/entry: Convert Stack segment exception to IDTENTRY Convert #SS to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.539867572@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 12 ++++-------- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index b9acc7f5684a..8d95cbf56624 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -81,7 +81,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun), INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, asm_exc_segment_not_present), - INTG(X86_TRAP_SS, stack_segment), + INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 88ba5f0400fd..3dfdc4d3de87 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -264,16 +264,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present) SIGBUS, 0, NULL); } -#define IP ((void __user *)uprobe_get_trap_addr(regs)) -#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \ +DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) +{ + do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS, + 0, NULL); } -DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment) -#undef IP - dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) { char *str = "alignment check"; -- cgit From be4c11afbb6d5317274e61fda0edf744080fb72b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:25 +0100 Subject: x86/entry: Convert General protection exception to IDTENTRY Convert #GP to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.637269946@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8d95cbf56624..6f0af12f08c8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -82,7 +82,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_TS, asm_exc_invalid_tss), INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, asm_exc_stack_segment), - INTG(X86_TRAP_GP, general_protection), + INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), INTG(X86_TRAP_AC, alignment_check), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3dfdc4d3de87..e65c7612ecf3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -145,7 +145,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, * process no chance to handle the signal and notice the * kernel fault information, so that won't result in polluting * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. + * delivered, faults. See also exc_general_protection below. */ tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -375,7 +375,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * which is what the stub expects, given that the faulting * RIP will be the IRET instruction. */ - regs->ip = (unsigned long)general_protection; + regs->ip = (unsigned long)asm_exc_general_protection; regs->sp = (unsigned long)&gpregs->orig_ax; return; @@ -494,7 +494,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs, #define GPFSTR "general protection fault" -dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_ERRORCODE(exc_general_protection) { char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR; enum kernel_gp_hint hint = GP_NO_HINT; @@ -502,7 +502,6 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) unsigned long gp_addr; int ret; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); cond_local_irq_enable(regs); if (static_cpu_has(X86_FEATURE_UMIP)) { @@ -570,7 +569,6 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) exit: cond_local_irq_disable(regs); } -NOKPROBE_SYMBOL(do_general_protection); dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { -- cgit From dad7106f8194df1b096666c5499ef732497ddb15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:26 +0100 Subject: x86/entry: Convert Spurious interrupt bug exception to IDTENTRY Convert #SPURIOUS to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.728077036@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6f0af12f08c8..8e8936dcf6e4 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -83,7 +83,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_NP, asm_exc_segment_not_present), INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, asm_exc_general_protection), - INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug), + INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, coprocessor_error), INTG(X86_TRAP_AC, alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e65c7612ecf3..2c638b9bc827 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -867,8 +867,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) math_error(regs, error_code, X86_TRAP_XF); } -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_spurious_interrupt_bug) { /* * This addresses a Pentium Pro Erratum: -- cgit From 14a8bd2aa7c355b3a8879618a4f70f9c2b0004f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:27 +0100 Subject: x86/entry: Convert Coprocessor error exception to IDTENTRY Convert #MF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change.
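The recurring "Remove the RCU warning" bullet in these conversions works because idtentry_enter() centralizes that correctness requirement. A heavily simplified sketch of the idea, not the exact implementation:

  static void idtentry_enter(struct pt_regs *regs)
  {
  	if (user_mode(regs)) {
  		/* context tracking: tell RCU/lockdep we left user mode */
  		enter_from_user_mode();
  	} else {
  		/* kernel mode: RCU must already be watching here */
  		lockdep_hardirqs_off(CALLER_ADDR0);
  	}
  }

With that done once on entry, a per-handler RCU_LOCKDEP_WARN(!rcu_is_watching(), ...) would only duplicate a check already made, which is why it keeps disappearing in the diffs.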
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.838823510@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 8e8936dcf6e4..2bde50d4cfa1 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -84,7 +84,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_SS, asm_exc_stack_segment), INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), - INTG(X86_TRAP_MF, coprocessor_error), + INTG(X86_TRAP_MF, asm_exc_coprocessor_error), INTG(X86_TRAP_AC, alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2c638b9bc827..ba26bebfed72 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -854,10 +854,9 @@ exit: cond_local_irq_disable(regs); } -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_MF); + math_error(regs, 0, X86_TRAP_MF); } dotraplinkage void -- cgit From 436608bb00a59f5457cee26f416067860ca88d9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:28 +0100 Subject: x86/entry: Convert Alignment check exception to IDTENTRY Convert #AC to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. 
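Most of the converted error-code traps funnel into a common delivery helper in traps.c; roughly (sketch, details trimmed):

  static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
  			  unsigned long trapnr, int signr, int sicode,
  			  void __user *addr)
  {
  	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
  	    NOTIFY_STOP) {
  		cond_local_irq_enable(regs);
  		/* signal delivery or die(), depending on the mode */
  		do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
  	}
  }

which is why handlers like exc_stack_segment() above reduce to a single call with the trap-specific constants.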
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134905.928967113@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 2bde50d4cfa1..af4819610783 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -85,7 +85,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_GP, asm_exc_general_protection), INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, asm_exc_coprocessor_error), - INTG(X86_TRAP_AC, alignment_check), + INTG(X86_TRAP_AC, asm_exc_alignment_check), INTG(X86_TRAP_XF, simd_coprocessor_error), #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ba26bebfed72..9f156c84195d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -270,12 +270,10 @@ DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment) 0, NULL); } -dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check) { char *str = "alignment check"; - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP) return; -- cgit From 48227e21f7430e31042f63e078a45cd230e9fdfc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:29 +0100 Subject: x86/entry: Convert SIMD coprocessor error exception to IDTENTRY Convert #XF to IDTENTRY: - Implement the C entry point with DEFINE_IDTENTRY - Emit the ASM stub with DECLARE_IDTENTRY - Handle INVD_BUG in C - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change.
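One detail worth noting ahead of the diff below: the INVD erratum case is forwarded to __exc_general_protection(), the inner body generated by DEFINE_IDTENTRY_ERRORCODE(). Sketch of the split being exploited (names as used in this series):

  /* outer entry point: does the enter/exit bookkeeping exactly once */
  __visible noinstr void exc_simd_coprocessor_error(struct pt_regs *regs);

  /* inner bodies: plain C, callable from another handler's body */
  static __always_inline void __exc_simd_coprocessor_error(struct pt_regs *regs);
  static __always_inline void __exc_general_protection(struct pt_regs *regs,
  						     unsigned long error_code);

Calling the __-prefixed body avoids running idtentry_enter()/idtentry_exit() twice for what is a single exception.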
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134906.021552202@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 29 +++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index af4819610783..38b565b7e5b8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -86,7 +86,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug), INTG(X86_TRAP_MF, asm_exc_coprocessor_error), INTG(X86_TRAP_AC, asm_exc_alignment_check), - INTG(X86_TRAP_XF, simd_coprocessor_error), + INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error), #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f156c84195d..1702922ebd9c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -810,7 +810,7 @@ NOKPROBE_SYMBOL(do_debug); * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -static void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; struct fpu *fpu = &task->thread.fpu; @@ -821,15 +821,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (fixup_exception(regs, trapnr, error_code, 0)) + if (fixup_exception(regs, trapnr, 0, 0)) goto exit; - task->thread.error_code = error_code; + task->thread.error_code = 0; task->thread.trap_nr = trapnr; - if (notify_die(DIE_TRAP, str, regs, error_code, - trapnr, SIGFPE) != NOTIFY_STOP) - die(str, regs, error_code); + if (notify_die(DIE_TRAP, str, regs, 0, trapnr, + SIGFPE) != NOTIFY_STOP) + die(str, regs, 0); goto exit; } @@ -839,7 +839,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) fpu__save(fpu); task->thread.trap_nr = trapnr; - task->thread.error_code = error_code; + task->thread.error_code = 0; si_code = fpu__exception_code(fpu, trapnr); /* Retry when we get spurious exceptions: */ @@ -854,14 +854,19 @@ exit: DEFINE_IDTENTRY(exc_coprocessor_error) { - math_error(regs, 0, X86_TRAP_MF); + math_error(regs, X86_TRAP_MF); } -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY(exc_simd_coprocessor_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - math_error(regs, error_code, X86_TRAP_XF); + if (IS_ENABLED(CONFIG_X86_INVD_BUG)) { + /* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */ + if (!static_cpu_has(X86_FEATURE_XMM)) { + __exc_general_protection(regs, 0); + return; + } + } + math_error(regs, X86_TRAP_XF); } DEFINE_IDTENTRY(exc_spurious_interrupt_bug) -- cgit From d77290507ab2ac691d50389e255ebd11a6cbc35a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:30 +0100 Subject: x86/entry/32: Convert IRET exception to IDTENTRY_SW Convert the IRET exception handler to IDTENTRY_SW. This is slightly different than the conversions of hardware exceptions as the IRET exception is invoked via an exception table when IRET faults. So it just uses the IDTENTRY_SW mechanism for consistency. It does not emit ASM code as it does not fit the other idtentry exceptions. 
- Implement the C entry point with DEFINE_IDTENTRY_SW() which maps to DEFINE_IDTENTRY() - Fixup the XEN/PV code - Remove the old prototypes - Remove the RCU warning as the new entry macro ensures correctness No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505134906.128769226@linutronix.de --- arch/x86/kernel/traps.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1702922ebd9c..b28a64d7691f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -925,14 +925,12 @@ DEFINE_IDTENTRY(exc_device_not_available) } #ifdef CONFIG_X86_32 -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_SW(iret_error) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); - - if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + if (notify_die(DIE_TRAP, "iret exception", regs, 0, X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0, ILL_BADSTK, (void __user *)NULL); } local_irq_disable(); -- cgit From 4979fb53ab0ed35eddd20a73c25a5597bc22a57f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jan 2020 15:53:09 +0100 Subject: x86/int3: Ensure that poke_int3_handler() is not traced In order to ensure poke_int3_handler() is completely self contained -- this is called while modifying other text, imagine the fun of hitting another INT3 -- ensure that everything it uses is not traced. The primary means here is to force inlining; bsearch() is notrace because all of lib/ is. 
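A short aside on the two annotations carrying the load here (sketch; the definitions live around include/linux/compiler_types.h and may differ in detail):

  /*
   * __always_inline: the body is folded into the caller, so no
   * patchable/traceable call site remains inside poke_int3_handler().
   */
  static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
  {
  	return _stext + tp->rel_addr;
  }

  /*
   * noinstr: compiled without tracing/instrumentation hooks and kept in
   * a separate text section, so the handler itself cannot be patched
   * behind its own back.
   */
  int noinstr poke_int3_handler(struct pt_regs *regs);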
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.410702173@linutronix.de --- arch/x86/kernel/alternative.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a9195ce8265d..dd81ed5beeca 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1011,7 +1011,8 @@ struct bp_patching_desc { static struct bp_patching_desc *bp_desc; -static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +static __always_inline +struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ @@ -1021,18 +1022,18 @@ static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **de return desc; } -static inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_desc(struct bp_patching_desc *desc) { smp_mb__before_atomic(); atomic_dec(&desc->refs); } -static inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(struct text_poke_loc *tp) { return _stext + tp->rel_addr; } -static int notrace patch_cmp(const void *key, const void *elt) +static int noinstr patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -1042,9 +1043,8 @@ static int notrace patch_cmp(const void *key, const void *elt) return 1; return 0; } -NOKPROBE_SYMBOL(patch_cmp); -int notrace poke_int3_handler(struct pt_regs *regs) +int noinstr poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; @@ -1118,7 +1118,6 @@ out_put: put_desc(desc); return ret; } -NOKPROBE_SYMBOL(poke_int3_handler); #define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) static struct text_poke_loc tp_vec[TP_VEC_MAX]; -- cgit From ef882bfef933408360e4d9d0c2c83a1e2fc996f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jan 2020 22:08:45 +0100 Subject: x86/int3: Avoid atomic instrumentation Use arch_atomic_*() and __READ_ONCE() to ensure nothing untoward creeps in and ruins things. That is: this is the INT3 text poke handler; strictly limit the code that runs in it, lest it inadvertently hit yet another INT3.
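Background, sketched from the generic instrumented-atomics wrappers (simplified; see include/asm-generic/atomic-instrumented.h):

  static __always_inline void atomic_dec(atomic_t *v)
  {
  	instrument_atomic_write(v, sizeof(*v));	/* KASAN/KCSAN hook */
  	arch_atomic_dec(v);			/* raw architecture op */
  }

The instrument_*() hooks are ordinary instrumentable code and may themselves contain an INT3 while patching is in flight; going straight to arch_atomic_*() and __READ_ONCE() below sidesteps that layer entirely.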
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.517429268@linutronix.de --- arch/x86/kernel/alternative.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index dd81ed5beeca..50a8d24a417e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1014,9 +1014,9 @@ static struct bp_patching_desc *bp_desc; static __always_inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) { - struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ + struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */ - if (!desc || !atomic_inc_not_zero(&desc->refs)) + if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) return NULL; return desc; @@ -1025,7 +1025,7 @@ struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) static __always_inline void put_desc(struct bp_patching_desc *desc) { smp_mb__before_atomic(); - atomic_dec(&desc->refs); + arch_atomic_dec(&desc->refs); } static __always_inline void *text_poke_addr(struct text_poke_loc *tp) -- cgit From f64366efd8c60b93138b813d071d2cd201fd0f6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Feb 2020 13:28:06 +0100 Subject: x86/int3: Inline bsearch() Avoid calling out to bsearch() by inlining it, for normal kernel configs this was the last external call and poke_int3_handler() is now fully self sufficient -- no calls to external code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.731774429@linutronix.de --- arch/x86/kernel/alternative.c | 8 ++++---- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 50a8d24a417e..8fd39ff74a49 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1033,7 +1033,7 @@ static __always_inline void *text_poke_addr(struct text_poke_loc *tp) return _stext + tp->rel_addr; } -static int noinstr patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *key, const void *elt) { struct text_poke_loc *tp = (struct text_poke_loc *) elt; @@ -1077,9 +1077,9 @@ int noinstr poke_int3_handler(struct pt_regs *regs) * Skip the binary search if there is a single member in the vector. */ if (unlikely(desc->nr_entries > 1)) { - tp = bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), - patch_cmp); + tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); if (!tp) goto out_put; } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b28a64d7691f..280c290f414f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -570,6 +570,11 @@ exit: dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { + /* + * poke_int3_handler() is completely self contained code; it does (and + * must) *NOT* call out to anything, lest it hits upon yet another + * INT3. 
+ */ if (poke_int3_handler(regs)) return; -- cgit From 8edd7e37aed8b9df938a63f0b0259c70569ce3d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:16:16 +0100 Subject: x86/entry: Convert INT3 exception to IDTENTRY_RAW Convert #BP to IDTENTRY_RAW: - Implement the C entry point with DEFINE_IDTENTRY_RAW - Invoke idtentry_enter/exit() from the function body - Emit the ASM stub with DECLARE_IDTENTRY_RAW - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. This could be a plain IDTENTRY, but as Peter pointed out INT3 is broken vs. the static key in the context tracking code as this static key might be in the state of being patched and has an int3 which would recurse forever. IDTENTRY_RAW is therefore chosen to allow addressing this issue without lots of code churn. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135313.938474960@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/traps.c | 28 +++++++++++++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 38b565b7e5b8..9ca8af65a212 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -60,7 +60,7 @@ static bool idt_setup_done __initdata; */ static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, debug), - SYSG(X86_TRAP_BP, int3), + SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 INTG(X86_TRAP_PF, page_fault), #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 280c290f414f..0ad12dffde22 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -568,7 +568,7 @@ exit: cond_local_irq_disable(regs); } -dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_RAW(exc_int3) { /* * poke_int3_handler() is completely self contained code; it does (and @@ -579,16 +579,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from pretty much - * any location in the kernel through kprobes -- text_poke() will most - * likely be handled by poke_int3_handler() above. This means this - * handler is effectively NMI-like. + * idtentry_enter() uses static_branch_{,un}likely() and therefore + * can trigger INT3, hence poke_int3_handler() must be done + * before. If the entry came from kernel mode, then use nmi_enter() + * because the INT3 could have been hit in any context including + * NMI. 
*/ - if (!user_mode(regs)) + if (user_mode(regs)) + idtentry_enter(regs); + else nmi_enter(); + instrumentation_begin(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ @@ -598,19 +602,21 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) goto exit; #endif - if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) goto exit; cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) + instrumentation_end(); + if (user_mode(regs)) + idtentry_exit(regs); + else nmi_exit(); } -NOKPROBE_SYMBOL(do_int3); #ifdef CONFIG_X86_64 /* -- cgit From 21e28290b31708b72763641604e239eb369c230d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2020 16:09:52 +0100 Subject: x86/traps: Split int3 handler up For code simplicity split up the int3 handler into a kernel and user part which makes the code flow simpler to understand. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505135314.045220765@linutronix.de --- arch/x86/kernel/traps.c | 68 +++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0ad12dffde22..21c8cfce24d3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -568,6 +568,35 @@ exit: cond_local_irq_disable(regs); } +static bool do_int3(struct pt_regs *regs) +{ + int res; + +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + return true; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + +#ifdef CONFIG_KPROBES + if (kprobe_int3_handler(regs)) + return true; +#endif + res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP); + + return res == NOTIFY_STOP; +} + +static void do_int3_user(struct pt_regs *regs) +{ + if (do_int3(regs)) + return; + + cond_local_irq_enable(regs); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); + cond_local_irq_disable(regs); +} + DEFINE_IDTENTRY_RAW(exc_int3) { /* @@ -585,37 +614,20 @@ DEFINE_IDTENTRY_RAW(exc_int3) * because the INT3 could have been hit in any context including * NMI. 
*/ - if (user_mode(regs)) + if (user_mode(regs)) { idtentry_enter(regs); - else - nmi_enter(); - - instrumentation_begin(); -#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP - if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; -#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ - -#ifdef CONFIG_KPROBES - if (kprobe_int3_handler(regs)) - goto exit; -#endif - - if (notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, - SIGTRAP) == NOTIFY_STOP) - goto exit; - - cond_local_irq_enable(regs); - do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL); - cond_local_irq_disable(regs); - -exit: - instrumentation_end(); - if (user_mode(regs)) + instrumentation_begin(); + do_int3_user(regs); + instrumentation_end(); idtentry_exit(regs); - else + } else { + nmi_enter(); + instrumentation_begin(); + if (!do_int3(regs)) + die("int3", regs, 0); + instrumentation_end(); nmi_exit(); + } } #ifdef CONFIG_X86_64 -- cgit From 94a46d316f2b54e3de8a4fa884cb16383db7fcd8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2020 22:37:31 +0200 Subject: x86/mce: Move nmi_enter/exit() into the entry point There is no reason to have nmi_enter/exit() in the actual MCE handlers. Move it to the entry point. This also covers the until now uncovered initial handler which only prints. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.243936614@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 26 +++++++++++++------------- arch/x86/kernel/cpu/mce/p5.c | 4 ---- arch/x86/kernel/cpu/mce/winchip.c | 4 ---- 3 files changed, 13 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index e9265e2f28c9..f5993ed6e16b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1100,8 +1100,10 @@ static void mce_clear_state(unsigned long *toclear) * kdump kernel establishing a new #MC handler where a broadcasted MCE * might not get handled properly. 
*/ -static bool __mc_check_crashing_cpu(int cpu) +static noinstr bool mce_check_crashing_cpu(void) { + unsigned int cpu = smp_processor_id(); + if (cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; @@ -1235,7 +1237,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); struct mca_config *cfg = &mca_cfg; - int cpu = smp_processor_id(); struct mce m, *final; char *msg = NULL; int worst = 0; @@ -1264,11 +1265,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) */ int lmce = 1; - if (__mc_check_crashing_cpu(cpu)) - return; - - nmi_enter(); - this_cpu_inc(mce_exception_count); mce_gather_info(&m, regs); @@ -1356,7 +1352,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) sync_core(); if (worst != MCE_AR_SEVERITY && !kill_it) - goto out_ist; + return; /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { @@ -1373,9 +1369,6 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } - -out_ist: - nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1912,11 +1905,18 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; -dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code) +dotraplinkage noinstr void do_mce(struct pt_regs *regs, long error_code) { + if (machine_check_vector == do_machine_check && + mce_check_crashing_cpu()) + return; + + nmi_enter(); + machine_check_vector(regs, error_code); + + nmi_exit(); } -NOKPROBE_SYMBOL(do_mce); /* * Called for each booted CPU to set up machine checks. 
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 5ee94aa1b766..dc29f0f7b3ed 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -25,8 +25,6 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - nmi_enter(); - rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,8 +37,6 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index b3938c195365..3f8f84ba0f51 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -19,12 +19,8 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - nmi_enter(); - pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - - nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ -- cgit From 8cd501c1facc159dff6db63775151c9200a3ea1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:23 +0100 Subject: x86/entry: Convert Machine Check to IDTENTRY_IST Convert #MC to IDTENTRY_MCE: - Implement the C entry points with DEFINE_IDTENTRY_MCE - Emit the ASM stub with DECLARE_IDTENTRY_MCE - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes - Remove the error code from *machine_check_vector() as it is always 0 and not used by any of the functions it can point to. Fixup all the functions as well. No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.334980426@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 23 ++++++++++++++--------- arch/x86/kernel/cpu/mce/inject.c | 4 ++-- arch/x86/kernel/cpu/mce/internal.h | 2 +- arch/x86/kernel/cpu/mce/p5.c | 2 +- arch/x86/kernel/cpu/mce/winchip.c | 2 +- arch/x86/kernel/idt.c | 10 +++++----- 6 files changed, 24 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f5993ed6e16b..842dd03c3918 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1232,7 +1232,7 @@ static void kill_me_maybe(struct callback_head *cb) * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. 
*/ -void noinstr do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1366,7 +1366,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) current->mce_kill_me.func = kill_me_now; task_work_add(current, ¤t->mce_kill_me, true); } else { - if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) + if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } } @@ -1895,27 +1895,32 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) +static void unexpected_machine_check(struct pt_regs *regs) { pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; -dotraplinkage noinstr void do_mce(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_MCE(exc_machine_check) { if (machine_check_vector == do_machine_check && mce_check_crashing_cpu()) return; - nmi_enter(); + if (user_mode(regs)) + idtentry_enter(regs); + else + nmi_enter(); - machine_check_vector(regs, error_code); + machine_check_vector(regs); - nmi_exit(); + if (user_mode(regs)) + idtentry_exit(regs); + else + nmi_exit(); } /* diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 3413b41b8d55..0593b192eb8f 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) regs.cs = m->cs; pregs = ®s; } - /* in mcheck exeception handler, irq will be disabled */ + /* do_machine_check() expects interrupts disabled -- at least */ local_irq_save(flags); - do_machine_check(pregs, 0); + do_machine_check(pregs); local_irq_restore(flags); m->finished = 0; } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 3b008172ad73..b74ca4a28c66 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -9,7 +9,7 @@ #include /* Pointer to the installed machine check handler for this CPU setup. 
*/ -extern void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *); enum severity_level { MCE_NO_SEVERITY, diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index dc29f0f7b3ed..eaebc4ce7398 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,7 +21,7 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs, long error_code) +static void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 3f8f84ba0f51..90e3d60c645e 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,7 +17,7 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs, long error_code) +static void winchip_machine_check(struct pt_regs *regs) { pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 9ca8af65a212..6b93840784d5 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -96,7 +96,7 @@ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DB, debug), #ifdef CONFIG_X86_MCE - INTG(X86_TRAP_MC, machine_check), + INTG(X86_TRAP_MC, asm_exc_machine_check), #endif SYSG(X86_TRAP_OF, asm_exc_overflow), @@ -185,11 +185,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, machine_check, IST_INDEX_MCE), + ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif }; -- cgit From aedbdeab00dcfcc6d751f9fb1b4896b01911d494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2020 15:39:13 +0200 Subject: x86/mce: Use untraced rd/wrmsr in the MCE offline/crash check mce_check_crashing_cpu() is called right at the entry of the MCE handler. It uses mce_rdmsr() and mce_wrmsr() which are wrappers around rdmsr() and wrmsr() to handle the MCE error injection mechanism, which is pointless in this context, i.e. when the MCE hits an offline CPU or the system is already marked crashing. The MSR access can also be traced, so use the untraceable variants. This is also safe vs. XEN paravirt as these MSRs are not affected by XEN PV modifications. 
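The practical difference, sketched as usage (simplified; mce_rdmsrl() lives in mce/core.c, __rdmsr() in arch/x86/include/asm/msr.h):

  u64 status;

  /* Normal path: honours the MCE error injection machinery and may
   * fire MSR tracepoints - both of which are instrumentable code: */
  status = mce_rdmsrl(MSR_IA32_MCG_STATUS);

  /* Crash/offline check path: effectively the bare RDMSR instruction: */
  status = __rdmsr(MSR_IA32_MCG_STATUS);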
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.426347351@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 842dd03c3918..317765245190 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1108,7 +1108,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1116,7 +1116,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); return true; } } -- cgit From 6271fef00b3489690e52ce95edbc378357513547 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:25 +0100 Subject: x86/entry: Convert NMI to IDTENTRY_NMI Convert #NMI to IDTENTRY_NMI: - Implement the C entry point with DEFINE_IDTENTRY_NMI - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.609932306@linutronix.de --- arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/nmi.c | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6b93840784d5..d3fecd88677c 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -74,7 +74,7 @@ static const __initconst struct idt_data early_idts[] = { */ static const __initconst struct idt_data def_idts[] = { INTG(X86_TRAP_DE, asm_exc_divide_error), - INTG(X86_TRAP_NMI, nmi), + INTG(X86_TRAP_NMI, asm_exc_nmi), INTG(X86_TRAP_BR, asm_exc_bounds), INTG(X86_TRAP_UD, asm_exc_invalid_op), INTG(X86_TRAP_NM, asm_exc_device_not_available), @@ -186,7 +186,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; */ static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), - ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bdcc5146de96..3b05cc802abb 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -503,8 +503,7 @@ static bool notrace is_debug_stack(unsigned long addr) NOKPROBE_SYMBOL(is_debug_stack); #endif -dotraplinkage notrace void -do_nmi(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_NMI(exc_nmi) { if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) return; @@ -554,7 +553,6 @@ nmi_restart: if (user_mode(regs)) mds_user_clear_cpu_buffers(); } -NOKPROBE_SYMBOL(do_nmi); void stop_nmi(void) { -- cgit From f051f697955049c7cf10a635ab8149aa619243b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Apr 2020 15:55:06 +0200 Subject: x86/nmi: Protect NMI entry against instrumentation Mark all functions in the fragile code parts noinstr or force inlining so they can't be instrumented. 
Also make the hardware latency tracer invocation explicit outside of non-instrumentable section. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.716186134@linutronix.de --- arch/x86/kernel/cpu/common.c | 6 ++---- arch/x86/kernel/nmi.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8be042df12c3..f4645f9ff9cb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1709,21 +1709,19 @@ void syscall_init(void) DEFINE_PER_CPU(int, debug_stack_usage); DEFINE_PER_CPU(u32, debug_idt_ctr); -void debug_stack_set_zero(void) +noinstr void debug_stack_set_zero(void) { this_cpu_inc(debug_idt_ctr); load_current_idt(); } -NOKPROBE_SYMBOL(debug_stack_set_zero); -void debug_stack_reset(void) +noinstr void debug_stack_reset(void) { if (WARN_ON(!this_cpu_read(debug_idt_ctr))) return; if (this_cpu_dec_return(debug_idt_ctr) == 0) load_current_idt(); } -NOKPROBE_SYMBOL(debug_stack_reset); #else /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 3b05cc802abb..3052c78f03aa 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -303,7 +303,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error); static DEFINE_PER_CPU(bool, swallow_nmi); static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static void default_do_nmi(struct pt_regs *regs) +static noinstr void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int handled; @@ -329,6 +329,8 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); + instrumentation_begin(); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -342,7 +344,7 @@ static void default_do_nmi(struct pt_regs *regs) */ if (handled > 1) __this_cpu_write(swallow_nmi, true); - return; + goto out; } /* @@ -374,7 +376,7 @@ static void default_do_nmi(struct pt_regs *regs) #endif __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); - return; + goto out; } raw_spin_unlock(&nmi_reason_lock); @@ -412,8 +414,10 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_add(nmi_stats.swallow, 1); else unknown_nmi_error(reason, regs); + +out: + instrumentation_end(); } -NOKPROBE_SYMBOL(default_do_nmi); /* * NMIs can page fault or hit breakpoints which will cause it to lose @@ -485,7 +489,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); */ static DEFINE_PER_CPU(int, update_debug_stack); -static bool notrace is_debug_stack(unsigned long addr) +static noinstr bool is_debug_stack(unsigned long addr) { struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); unsigned long top = CEA_ESTACK_TOP(cs, DB); @@ -500,7 +504,6 @@ static bool notrace is_debug_stack(unsigned long addr) */ return addr >= bot && addr < top; } -NOKPROBE_SYMBOL(is_debug_stack); #endif DEFINE_IDTENTRY_NMI(exc_nmi) -- cgit From 9f58fdde95c9509a4ded27a6d0035e79294002b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2020 21:02:56 +0200 Subject: x86/db: Split out dr6/7 handling DR6/7 should be handled before nmi_enter() is invoked and restore after nmi_exit() to minimize the exposure. Split it out into helper inlines and bring it into the correct order. 
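The resulting ordering, sketched (handle_db() is a placeholder name for the actual handling code):

  debug_enter(&dr6, &dr7);	/* dr7 = 0: HW breakpoints cannot recurse */
  nmi_enter();			/* safe even if this code path is watched */
  handle_db(regs, dr6);
  nmi_exit();
  debug_exit(dr7);		/* restore the user's breakpoints last */

i.e. breakpoints are dead across the whole window, including the nmi_enter()/nmi_exit() bookkeeping itself.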
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.808628211@linutronix.de --- arch/x86/kernel/hw_breakpoint.c | 6 +--- arch/x86/kernel/traps.c | 75 ++++++++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42fc0eaf193..9ddf441ccaa8 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -464,7 +464,7 @@ static int hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct perf_event *bp; - unsigned long dr7, dr6; + unsigned long dr6; unsigned long *dr6_p; /* The DR6 value is pointed by args->err */ @@ -479,9 +479,6 @@ static int hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - get_debugreg(dr7, 7); - /* Disable breakpoints during exception handling */ - set_debugreg(0UL, 7); /* * Assert that local interrupts are disabled * Reset the DRn bits in the virtualized register value. @@ -538,7 +535,6 @@ static int hw_breakpoint_handler(struct die_args *args) (dr6 & (~DR_TRAP_BITS))) rc = NOTIFY_DONE; - set_debugreg(dr7, 7); put_cpu(); return rc; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 21c8cfce24d3..de5120e2fbe1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -700,6 +700,57 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } +static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7) +{ + /* + * Disable breakpoints during exception handling; recursive exceptions + * are exceedingly 'fun'. + * + * Since this function is NOKPROBE, and that also applies to + * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a + * HW_BREAKPOINT_W on our stack) + * + * Entry text is excluded for HW_BP_X and cpu_entry_area, which + * includes the entry stack is excluded for everything. + */ + get_debugreg(*dr7, 7); + set_debugreg(0, 7); + + /* + * Ensure the compiler doesn't lower the above statements into + * the critical section; disabling breakpoints late would not + * be good. + */ + barrier(); + + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + get_debugreg(*dr6, 6); + set_debugreg(0, 6); + /* Filter out all the reserved bits which are preset to 1 */ + *dr6 &= ~DR6_RESERVED; +} + +static __always_inline void debug_exit(unsigned long dr7) +{ + /* + * Ensure the compiler doesn't raise this statement into + * the critical section; enabling breakpoints early would + * not be good. + */ + barrier(); + set_debugreg(dr7, 7); +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -727,28 +778,13 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; + unsigned long dr6, dr7; int user_icebp = 0; - unsigned long dr6; int si_code; - nmi_enter(); - - get_debugreg(dr6, 6); - /* - * The Intel SDM says: - * - * Certain debug exceptions may clear bits 0-3. 
The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. - * - * Keep it simple: clear DR6 immediately. - */ - set_debugreg(0, 6); + debug_enter(&dr6, &dr7); - /* Filter out all the reserved bits which are preset to 1 */ - dr6 &= ~DR6_RESERVED; + nmi_enter(); /* * The SDM says "The processor clears the BTF flag when it @@ -786,7 +822,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) goto exit; /* @@ -825,6 +861,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) exit: nmi_exit(); + debug_exit(dr7); } NOKPROBE_SYMBOL(do_debug); -- cgit From 2bbc68f8373c0631ebf137f376fbea00e8086be7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:26 +0100 Subject: x86/entry: Convert Debug exception to IDTENTRY_DB Convert #DB to IDTENTRY_ERRORCODE: - Implement the C entry point with DEFINE_IDTENTRY_DB - Emit the ASM stub with DECLARE_IDTENTRY - Remove the ASM idtentry in 64bit - Remove the open coded ASM entry code in 32bit - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135314.900297476@linutronix.de --- arch/x86/kernel/idt.c | 8 ++++---- arch/x86/kernel/traps.c | 21 +++++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index d3fecd88677c..ddf3f3db3235 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -59,7 +59,7 @@ static bool idt_setup_done __initdata; * stacks work only after cpu_init(). */ static const __initconst struct idt_data early_idts[] = { - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 INTG(X86_TRAP_PF, page_fault), @@ -93,7 +93,7 @@ static const __initconst struct idt_data def_idts[] = { #else INTG(X86_TRAP_DF, double_fault), #endif - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), #ifdef CONFIG_X86_MCE INTG(X86_TRAP_MC, asm_exc_machine_check), @@ -164,7 +164,7 @@ static const __initconst struct idt_data early_pf_idts[] = { * stack set to DEFAULT_STACK (0). Required for NMI trap handling. */ static const __initconst struct idt_data dbg_idts[] = { - INTG(X86_TRAP_DB, debug), + INTG(X86_TRAP_DB, asm_exc_debug), }; #endif @@ -185,7 +185,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de5120e2fbe1..569408a681b6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,7 +775,7 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. 
*/ -dotraplinkage void do_debug(struct pt_regs *regs, long error_code) +DEFINE_IDTENTRY_DEBUG(exc_debug) { struct task_struct *tsk = current; unsigned long dr6, dr7; @@ -784,7 +784,10 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_enter(&dr6, &dr7); - nmi_enter(); + if (user_mode(regs)) + idtentry_enter(regs); + else + nmi_enter(); /* * The SDM says "The processor clears the BTF flag when it @@ -821,7 +824,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; #endif - if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) goto exit; @@ -835,8 +838,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) cond_local_irq_enable(regs); if (v8086_mode(regs)) { - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, - X86_TRAP_DB); + handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, + X86_TRAP_DB); cond_local_irq_disable(regs); debug_stack_usage_dec(); goto exit; @@ -855,15 +858,17 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) } si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(regs, error_code, si_code); + send_sigtrap(regs, 0, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); exit: - nmi_exit(); + if (user_mode(regs)) + idtentry_exit(regs); + else + nmi_exit(); debug_exit(dr7); } -NOKPROBE_SYMBOL(do_debug); /* * Note that we play around with the 'TS' bit in an attempt to get -- cgit From 4c0dcd8350a03cb65f645a039f2772be880ee74a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:29 +0100 Subject: x86/entry: Implement user mode C entry points for #DB and #MCE The MCE entry point uses the same mechanism as the IST entry point for now. For #DB split the inner workings and just keep the nmi_enter/exit() magic in the IST variant. Fixup the ASM code to emit the proper noist_##cfunc call. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.177564104@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 40 +++++++++++++++++++----- arch/x86/kernel/traps.c | 70 ++++++++++++++++++++++++++++++++---------- 2 files changed, 87 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 317765245190..a72c0135a5ec 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1904,24 +1904,50 @@ static void unexpected_machine_check(struct pt_regs *regs) /* Call the installed machine check handler for this CPU setup. */ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; -DEFINE_IDTENTRY_MCE(exc_machine_check) +static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { + /* + * Only required when from kernel mode. See + * mce_check_crashing_cpu() for details. 
+ */ if (machine_check_vector == do_machine_check && mce_check_crashing_cpu()) return; - if (user_mode(regs)) - idtentry_enter(regs); - else - nmi_enter(); + nmi_enter(); + machine_check_vector(regs); + nmi_exit(); +} +static __always_inline void exc_machine_check_user(struct pt_regs *regs) +{ + idtentry_enter(regs); machine_check_vector(regs); + idtentry_exit(regs); +} +#ifdef CONFIG_X86_64 +/* MCE hit kernel mode */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ + exc_machine_check_kernel(regs); +} + +/* The user mode variant. */ +DEFINE_IDTENTRY_MCE_USER(exc_machine_check) +{ + exc_machine_check_user(regs); +} +#else +/* 32bit unified entry point */ +DEFINE_IDTENTRY_MCE(exc_machine_check) +{ if (user_mode(regs)) - idtentry_exit(regs); + exc_machine_check_user(regs); else - nmi_exit(); + exc_machine_check_kernel(regs); } +#endif /* * Called for each booted CPU to set up machine checks. diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 569408a681b6..4f248c5d5cab 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,20 +775,12 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -DEFINE_IDTENTRY_DEBUG(exc_debug) +static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) { struct task_struct *tsk = current; - unsigned long dr6, dr7; int user_icebp = 0; int si_code; - debug_enter(&dr6, &dr7); - - if (user_mode(regs)) - idtentry_enter(regs); - else - nmi_enter(); - /* * The SDM says "The processor clears the BTF flag when it * generates a debug exception." Clear TIF_BLOCKSTEP to keep @@ -800,7 +792,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) is_sysenter_singlestep(regs))) { dr6 &= ~DR_STEP; if (!dr6) - goto exit; + return; /* * else we might have gotten a single-step trap and hit a * watchpoint at the same time, in which case we should fall @@ -821,12 +813,12 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) #ifdef CONFIG_KPROBES if (kprobe_debug_handler(regs)) - goto exit; + return; #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) - goto exit; + return; /* * Let others (NMI) know that the debug stack is in use @@ -842,7 +834,7 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) X86_TRAP_DB); cond_local_irq_disable(regs); debug_stack_usage_dec(); - goto exit; + return; } if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { @@ -861,14 +853,60 @@ DEFINE_IDTENTRY_DEBUG(exc_debug) send_sigtrap(regs, 0, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); +} + +static __always_inline void exc_debug_kernel(struct pt_regs *regs, + unsigned long dr6) +{ + nmi_enter(); + handle_debug(regs, dr6); + nmi_exit(); +} + +static __always_inline void exc_debug_user(struct pt_regs *regs, + unsigned long dr6) +{ + idtentry_enter(regs); + handle_debug(regs, dr6); + idtentry_exit(regs); +} + +#ifdef CONFIG_X86_64 +/* IST stack entry */ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_kernel(regs, dr6); + debug_exit(dr7); +} + +/* User entry, runs on regular task stack */ +DEFINE_IDTENTRY_DEBUG_USER(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); + exc_debug_user(regs, dr6); + debug_exit(dr7); +} +#else +/* 32 bit does not have separate entry points. 
*/ +DEFINE_IDTENTRY_DEBUG(exc_debug) +{ + unsigned long dr6, dr7; + + debug_enter(&dr6, &dr7); -exit: if (user_mode(regs)) - idtentry_exit(regs); + exc_debug_user(regs, dr6); else - nmi_exit(); + exc_debug_kernel(regs, dr6); + debug_exit(dr7); } +#endif /* * Note that we play around with the 'TS' bit in an attempt to get -- cgit From 9347f41352181bf4a7e663f7b5f4a4bb32244d73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 May 2020 19:56:26 +0200 Subject: x86/traps: Restructure #DB handling Now that there are separate entry points, move the kernel/user_mode specifc checks into the entry functions so the common handling code does not need the extra mode checks. Make the code more readable while at it. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.283276272@linutronix.de --- arch/x86/kernel/traps.c | 69 +++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4f248c5d5cab..b62e962871f2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -775,39 +775,12 @@ static __always_inline void debug_exit(unsigned long dr7) * * May run on IST stack. */ -static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) +static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, + bool user_icebp) { struct task_struct *tsk = current; - int user_icebp = 0; int si_code; - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - - if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && - is_sysenter_singlestep(regs))) { - dr6 &= ~DR_STEP; - if (!dr6) - return; - /* - * else we might have gotten a single-step trap and hit a - * watchpoint at the same time, in which case we should fall - * through and handle the watchpoint. - */ - } - - /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. - */ - if (!dr6 && user_mode(regs)) - user_icebp = 1; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -832,9 +805,7 @@ static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, 0, X86_TRAP_DB); - cond_local_irq_disable(regs); - debug_stack_usage_dec(); - return; + goto out; } if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { @@ -848,9 +819,12 @@ static noinstr void handle_debug(struct pt_regs *regs, unsigned long dr6) set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; } + si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(regs, 0, si_code); + +out: cond_local_irq_disable(regs); debug_stack_usage_dec(); } @@ -859,7 +833,27 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { nmi_enter(); - handle_debug(regs, dr6); + /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * Catch SYSENTER with TF set and clear DR_STEP. 
If this hit a + * watchpoint at the same time then that will still be handled. + */ + if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) + dr6 &= ~DR_STEP; + + /* + * If DR6 is zero, no point in trying to handle it. The kernel is + * not using INT1. + */ + if (dr6) + handle_debug(regs, dr6, false); + nmi_exit(); } @@ -867,7 +861,14 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { idtentry_enter(regs); - handle_debug(regs, dr6); + clear_thread_flag(TIF_BLOCKSTEP); + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + handle_debug(regs, dr6, !dr6); idtentry_exit(regs); } -- cgit From 75347bb2535a6d5549cc3e436467b7c40d7bb874 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 30 Apr 2020 11:07:20 +0200 Subject: x86/traps: Address objtool noinstr complaints in #DB The functions invoked from handle_debug() can be instrumented. Tell objtool about it. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.380927730@linutronix.de --- arch/x86/kernel/traps.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b62e962871f2..41bb0cb9df84 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -784,14 +784,19 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; + instrumentation_begin(); #ifdef CONFIG_KPROBES - if (kprobe_debug_handler(regs)) + if (kprobe_debug_handler(regs)) { + instrumentation_end(); return; + } #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, - SIGTRAP) == NOTIFY_STOP) + SIGTRAP) == NOTIFY_STOP) { + instrumentation_end(); return; + } /* * Let others (NMI) know that the debug stack is in use @@ -827,6 +832,7 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, out: cond_local_irq_disable(regs); debug_stack_usage_dec(); + instrumentation_end(); } static __always_inline void exc_debug_kernel(struct pt_regs *regs, -- cgit From 865d3a9afe7eddf320e7f61a442864d6efe27505 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 21:22:36 +0200 Subject: x86/mce: Address objtools noinstr complaints Mark the relevant functions noinstr, use the plain non-instrumented MSR accessors. The only odd part is the instrumentation_begin()/end() pair around the indirect machine_check_vector() call as objtool can't figure that out. The possible invoked functions are annotated correctly. Also use notrace variant of nmi_enter/exit(). If MCEs happen then hardware latency tracing is the least of the worries. 
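As an illustration of the annotation rules applied here, a minimal
sketch (the function name and message are made up; only the pattern is
taken from the hunks below):

  /* noinstr: the handler itself must not be instrumented. */
  static noinstr void example_machine_check(struct pt_regs *regs)
  {
          /* Plain accessor without tracing hooks, safe in noinstr code. */
          u64 cap = __rdmsr(MSR_IA32_MCG_CAP);

          /*
           * Calls into instrumentable code must be bracketed so
           * objtool can validate the noinstr section.
           */
          instrumentation_begin();
          pr_err("CPU#%d: MCG_CAP: 0x%llx\n", smp_processor_id(), cap);
          instrumentation_end();
  }
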
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.476734898@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 20 +++++++++++++++----- arch/x86/kernel/cpu/mce/p5.c | 4 +++- arch/x86/kernel/cpu/mce/winchip.c | 4 +++- 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a72c0135a5ec..a32a7e236bb1 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -130,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +noinstr void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); @@ -140,12 +140,12 @@ void mce_setup(struct mce *m) m->cpuid = cpuid_eax(1); m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; - rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) - rdmsrl(MSR_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_PPIN); else if (this_cpu_has(X86_FEATURE_AMD_PPIN)) - rdmsrl(MSR_AMD_PPIN, m->ppin); + m->ppin = __rdmsr(MSR_AMD_PPIN); m->microcode = boot_cpu_data.microcode; } @@ -1895,10 +1895,12 @@ bool filter_mce(struct mce *m) } /* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs) +static noinstr void unexpected_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); + instrumentation_end(); } /* Call the installed machine check handler for this CPU setup. */ @@ -1915,14 +1917,22 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) return; nmi_enter(); + /* + * The call targets are marked noinstr, but objtool can't figure + * that out because it's an indirect call. Annotate it. 
+ */ + instrumentation_begin(); machine_check_vector(regs); + instrumentation_end(); nmi_exit(); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) { idtentry_enter(regs); + instrumentation_begin(); machine_check_vector(regs); + instrumentation_end(); idtentry_exit(regs); } diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index eaebc4ce7398..19e90cae8e97 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -21,10 +21,11 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ -static void pentium_machine_check(struct pt_regs *regs) +static noinstr void pentium_machine_check(struct pt_regs *regs) { u32 loaddr, hi, lotype; + instrumentation_begin(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -37,6 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs) } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index 90e3d60c645e..9c9f0abd2d7f 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -17,10 +17,12 @@ #include "internal.h" /* Machine check handler for WinChip C6: */ -static void winchip_machine_check(struct pt_regs *regs) +static noinstr void winchip_machine_check(struct pt_regs *regs) { + instrumentation_begin(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + instrumentation_end(); } /* Set up machine check reporting on the Winchip C6 series */ -- cgit From c29c775a554f7060b6fb31b68f88a3c9087cf1c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2020 23:33:31 +0100 Subject: x86/entry: Convert double fault exception to IDTENTRY_DF Convert #DF to IDTENTRY_DF - Implement the C entry point with DEFINE_IDTENTRY_DF - Emit the ASM stub with DECLARE_IDTENTRY_DF on 64bit - Remove the ASM idtentry in 64bit - Adjust the 32bit shim code - Fixup the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200505135315.583415264@linutronix.de --- arch/x86/kernel/doublefault_32.c | 10 ++++------ arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/traps.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 2ccc57f152a4..759d392cbe9f 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -10,7 +10,6 @@ #include #include -extern void double_fault(void); #define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) #define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x) @@ -21,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu); * Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks * we're running the doublefault task. Cannot return. */ -asmlinkage notrace void __noreturn doublefault_shim(void) +asmlinkage noinstr void __noreturn doublefault_shim(void) { unsigned long cr2; struct pt_regs regs; @@ -40,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) * Fill in pt_regs. 
A downside of doing this in C is that the unwinder * won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump * won't successfully unwind to the source of the double fault. - * The main dump from do_double_fault() is fine, though, since it + * The main dump from exc_double_fault() is fine, though, since it * uses these regs directly. * * If anyone ever cares, this could be moved to asm. @@ -70,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void) regs.cx = TSS(cx); regs.bx = TSS(bx); - do_double_fault(®s, 0, cr2); + exc_double_fault(®s, 0, cr2); /* * x86_32 does not save the original CR3 anywhere on a task switch. @@ -84,7 +83,6 @@ asmlinkage notrace void __noreturn doublefault_shim(void) */ panic("cannot return from double fault\n"); } -NOKPROBE_SYMBOL(doublefault_shim); DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .tss = { @@ -95,7 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = { .ldt = 0, .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, - .ip = (unsigned long) double_fault, + .ip = (unsigned long) asm_exc_double_fault, .flags = X86_EFLAGS_FIXED, .es = __USER_DS, .cs = __KERNEL_CS, diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ddf3f3db3235..ec55479e1dd1 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -91,7 +91,7 @@ static const __initconst struct idt_data def_idts[] = { #ifdef CONFIG_X86_32 TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS), #else - INTG(X86_TRAP_DF, double_fault), + INTG(X86_TRAP_DF, asm_exc_double_fault), #endif INTG(X86_TRAP_DB, asm_exc_debug), @@ -187,7 +187,7 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB), ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI), - ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), + ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE), #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 41bb0cb9df84..35298c1df32f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -319,12 +319,19 @@ __visible void __noreturn handle_stack_overflow(const char *message, * from the TSS. Returning is, in principle, okay, but changes to regs will * be lost. If, for some reason, we need to return to a context with modified * regs, the shim code could be adjusted to synchronize the registers. + * + * The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs + * to be read before doing anything else. */ -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2) +DEFINE_IDTENTRY_DF(exc_double_fault) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_X86_64 + unsigned long address = read_cr2(); +#endif + #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -381,6 +388,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign #endif nmi_enter(); + instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -424,13 +432,16 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * stack even if the actual trigger for the double fault was * something else. 
*/ - if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) - handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); + if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { + handle_stack_overflow("kernel stack overflow (double-fault)", + regs, address); + } #endif pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); panic("Machine halted."); + instrumentation_end(); } DEFINE_IDTENTRY(exc_bounds) -- cgit From 7102cb07132624cdc09aa8e40c03ae34b4cbb74a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2020 09:42:41 +0200 Subject: x86/entry: Fix allnoconfig build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following commit: 095b7a3e7745 ("x86/entry: Convert double fault exception to IDTENTRY_DF") introduced a new build warning on 64-bit allnoconfig kernels, that have CONFIG_VMAP_STACK disabled: arch/x86/kernel/traps.c:332:16: warning: unused variable ‘address’ [-Wunused-variable] This variable is only used if CONFIG_VMAP_STACK is defined, so make it dependent on that, not CONFIG_X86_64. Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Alexandre Chartre Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Borislav Petkov --- arch/x86/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 35298c1df32f..9e5d81cb94ba 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -328,7 +328,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) static const char str[] = "double fault"; struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); #endif -- cgit From fa95d7dc1abceb288db2959badb9aaf558eb0530 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:19 +0200 Subject: x86/idtentry: Switch to conditional RCU handling Switch all idtentry_enter/exit() users over to the new conditional RCU handling scheme and make the user mode entries in #DB, #INT3 and #MCE use the user mode idtentry functions. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.382387286@linutronix.de --- arch/x86/kernel/cpu/mce/core.c | 4 ++-- arch/x86/kernel/traps.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index a32a7e236bb1..c47f004f6231 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1929,11 +1929,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) static __always_inline void exc_machine_check_user(struct pt_regs *regs) { - idtentry_enter(regs); + idtentry_enter_user(regs); instrumentation_begin(); machine_check_vector(regs); instrumentation_end(); - idtentry_exit(regs); + idtentry_exit_user(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9e5d81cb94ba..f28be3e51cca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -619,18 +619,18 @@ DEFINE_IDTENTRY_RAW(exc_int3) return; /* - * idtentry_enter() uses static_branch_{,un}likely() and therefore + * idtentry_enter_user() uses static_branch_{,un}likely() and therefore * can trigger INT3, hence poke_int3_handler() must be done * before. 
If the entry came from kernel mode, then use nmi_enter() * because the INT3 could have been hit in any context including * NMI. */ if (user_mode(regs)) { - idtentry_enter(regs); + idtentry_enter_user(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); - idtentry_exit(regs); + idtentry_exit_user(regs); } else { nmi_enter(); instrumentation_begin(); @@ -877,7 +877,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { - idtentry_enter(regs); + idtentry_enter_user(regs); clear_thread_flag(TIF_BLOCKSTEP); /* @@ -886,7 +886,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, * User wants a sigtrap for that. */ handle_debug(regs, dr6, !dr6); - idtentry_exit(regs); + idtentry_exit_user(regs); } #ifdef CONFIG_X86_64 -- cgit From eb6555c83933ce8e094d5429d57970aaa9f0591e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:24 +0200 Subject: x86/entry/64: Move do_softirq_own_stack() to C The first step to get rid of the ENTER/LEAVE_IRQ_STACK ASM macro maze. Use the new C code helpers to move do_softirq_own_stack() out of ASM code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202117.870911120@linutronix.de --- arch/x86/kernel/irq_64.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b32ab009c19..1b4fe93a86c5 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -70,3 +71,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; return map_irq_stack(cpu); } + +void do_softirq_own_stack(void) +{ + run_on_irqstack_cond(__do_softirq, NULL, NULL); +} -- cgit From 91eeafea1e4b7c95cc4f38af186d7d48fceef89a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:28 +0200 Subject: x86/entry: Switch page fault exception to IDTENTRY_RAW Convert page fault exceptions to IDTENTRY_RAW: - Implement the C entry point with DEFINE_IDTENTRY_RAW - Add the CR2 read into the exception handler - Add the idtentry_enter/exit_cond_rcu() invocations in in the regular page fault handler and in the async PF part. - Emit the ASM stub with DECLARE_IDTENTRY_RAW - Remove the ASM idtentry in 64-bit - Remove the CR2 read from 64-bit - Remove the open coded ASM entry code in 32-bit - Fix up the XEN/PV code - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.238455120@linutronix.de --- arch/x86/kernel/idt.c | 4 ++-- arch/x86/kernel/kvm.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ec55479e1dd1..ddb11154aeee 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -62,7 +62,7 @@ static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), #ifdef CONFIG_X86_32 - INTG(X86_TRAP_PF, page_fault), + INTG(X86_TRAP_PF, asm_exc_page_fault), #endif }; @@ -156,7 +156,7 @@ static const __initconst struct idt_data apic_idts[] = { * stacks work only after cpu_init(). 
*/ static const __initconst struct idt_data early_pf_idts[] = { - INTG(X86_TRAP_PF, page_fault), + INTG(X86_TRAP_PF, asm_exc_page_fault), }; /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index d6f22a3a1f7d..d00f7c430e65 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -218,7 +218,7 @@ again: } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_apf_flags(void) +noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; @@ -230,11 +230,11 @@ u32 kvm_read_and_reset_apf_flags(void) return flags; } EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); -bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { u32 reason = kvm_read_and_reset_apf_flags(); + bool rcu_exit; switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: @@ -244,6 +244,9 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return false; } + rcu_exit = idtentry_enter_cond_rcu(regs); + instrumentation_begin(); + /* * If the host managed to inject an async #PF into an interrupt * disabled region, then die hard as this is not going to end well @@ -258,13 +261,13 @@ bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) /* Page is swapped out by the host. */ kvm_async_pf_task_wait_schedule(token); } else { - rcu_irq_enter(); kvm_async_pf_task_wake(token); - rcu_irq_exit(); } + + instrumentation_end(); + idtentry_exit_cond_rcu(regs, rcu_exit); return true; } -NOKPROBE_SYMBOL(__kvm_handle_async_pf); static void __init paravirt_ops_setup(void) { -- cgit From 79b9c183021ef3f5ca2d5168cd3fd442580eca09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:33 +0200 Subject: x86/irq: Use generic irq_regs implementation The only difference is the name of the per-CPU variable: irq_regs vs. __irq_regs, but the accessor functions are identical. Remove the pointless copy and use the generic variant. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.704169051@linutronix.de --- arch/x86/kernel/irq.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7965ff429c5..252065d32ab5 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -26,9 +26,6 @@ DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -DEFINE_PER_CPU(struct pt_regs *, irq_regs); -EXPORT_PER_CPU_SYMBOL(irq_regs); - atomic_t irq_err_count; /* -- cgit From 633260fa143bbed05e65dc557a492667dfdc45bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:34 +0200 Subject: x86/irq: Convey vector as argument and not in ptregs Device interrupts which go through do_IRQ() or the spurious interrupt handler have their separate entry code on 64 bit for no good reason. Both 32 and 64 bit transport the vector number through ORIG_[RE]AX in pt_regs. Further the vector number is forced to fit into an u8 and is complemented and offset by 0x80 so it's in the signed character range. Otherwise GAS would expand the pushq to a 5 byte instruction for any vector > 0x7F. Treat the vector number like an error code and hand it to the C function as argument. This allows to get rid of the extra entry code in a later step. Simplify the error code push magic by implementing the pushq imm8 via a '.byte 0x6a, vector' sequence so GAS is not able to screw it up. 
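Conceptually, each per-vector entry stub then boils down to the
following (a sketch, not the verbatim entry code; the target label
may differ):

  /*
   * Per-vector stub: push the vector into the error code slot of
   * pt_regs, then jump to the common interrupt entry point.
   *
   *	.byte	0x6a, vector	# pushq $vector (imm8)
   *	jmp	common_interrupt
   */
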
As the pushq imm8 is sign extending the resulting error code needs to be truncated to 8 bits in C code. Originally-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.796915981@linutronix.de --- arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++++++------ arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/irq.c | 14 +++++++++----- 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4b1d31be50b4..6c2b807a7eae 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2120,15 +2120,29 @@ void __init register_lapic_address(unsigned long address) * Local APIC interrupts */ -/* - * This interrupt should _never_ happen with our APIC/SMP architecture +/** + * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors + * @regs: Pointer to pt_regs on stack + * @error_code: The vector number is in the lower 8 bits + * + * This is invoked from ASM entry code to catch all interrupts which + * trigger on an entry which is routed to the common_spurious idtentry + * point. + * + * Also called from smp_spurious_apic_interrupt(). */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) +__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, + unsigned long vector) { - u8 vector = ~regs->orig_ax; u32 v; entering_irq(); + /* + * The push in the entry ASM code which stores the vector number on + * the stack in the error code slot is sign expanding. Just use the + * lower 8 bits. + */ + vector &= 0xFF; trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2149,11 +2163,11 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); if (v & (1 << (vector & 0x1f))) { - pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n", vector, smp_processor_id()); ack_APIC_irq(); } else { - pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", + pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Not pending!\n", vector, smp_processor_id()); } out: @@ -2161,6 +2175,11 @@ out: exiting_irq(); } +__visible void smp_spurious_apic_interrupt(struct pt_regs *regs) +{ + smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); +} + /* * This interrupt should never happen with our APIC/SMP architecture */ diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index ddb11154aeee..20408e31c18d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -145,7 +145,7 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_UV INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), #endif - INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt), + INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 252065d32ab5..c7669363251a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -227,14 +227,18 @@ u64 arch_irq_stat(void) * SMP cross-CPU interrupts have their own specific * handlers). 
*/ -__visible void __irq_entry do_IRQ(struct pt_regs *regs) +__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) { struct pt_regs *old_regs = set_irq_regs(regs); - struct irq_desc * desc; - /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_ax; + struct irq_desc *desc; entering_irq(); + /* + * The push in the entry ASM code which stores the vector number on + * the stack in the error code slot is sign expanding. Just use the + * lower 8 bits. + */ + vector &= 0xFF; /* entering_irq() tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); @@ -249,7 +253,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs) ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { -- cgit From 7c2a57364cae0f2e070a27d728f1df6844ffff56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:35 +0200 Subject: x86/irq: Rework handle_irq() for 64-bit To consolidate the interrupt entry/exit code vs. the other exceptions make handle_irq() an inline and handle both 64-bit and 32-bit mode. Preparatory change to move irq stack switching for 64-bit to C which allows to consolidate the entry exit handling by reusing the idtentry machinery both in ASM and C. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202118.889972748@linutronix.de --- arch/x86/kernel/irq.c | 11 ++++++++++- arch/x86/kernel/irq_32.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c7669363251a..5495ea4debba 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -221,6 +222,14 @@ u64 arch_irq_stat(void) return sum; } +static __always_inline void handle_irq(struct irq_desc *desc, + struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_X86_64)) + run_on_irqstack_cond(desc->handle_irq, desc, regs); + else + __handle_irq(desc, regs); +} /* * do_IRQ handles all normal device IRQ's (the special @@ -246,7 +255,7 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { if (IS_ENABLED(CONFIG_X86_32)) - handle_irq(desc, regs); + __handle_irq(desc, regs); else generic_handle_irq_desc(desc); } else { diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a759ca97cd01..0b79efc87be5 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -148,7 +148,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } -void handle_irq(struct irq_desc *desc, struct pt_regs *regs) +void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { int overflow = check_stack_overflow(); -- cgit From fa5e5c409213265da8a188b4a5e4e641b1382eb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:37 +0200 Subject: x86/entry: Use idtentry for interrupts Replace the extra interrupt handling code and reuse the existing idtentry machinery. This moves the irq stack switching on 64-bit from ASM to C code; 32-bit already does the stack switching in C. This requires to remove HAVE_IRQ_EXIT_ON_IRQ_STACK as the stack switch is not longer in the low level entry code. 
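A simplified sketch of what the DEFINE_IDTENTRY_IRQ machinery expands
to for the new common_interrupt() (an approximation of the macro in
arch/x86/include/asm/idtentry.h, not its verbatim expansion):

  __visible noinstr void common_interrupt(struct pt_regs *regs,
                                          unsigned long error_code)
  {
          bool rcu_exit = idtentry_enter_cond_rcu(regs);

          instrumentation_begin();
          irq_enter_rcu();
          /* handler body; the vector is truncated back to 8 bits */
          __common_interrupt(regs, (u8)error_code);
          irq_exit_rcu();
          instrumentation_end();
          idtentry_exit_cond_rcu(regs, rcu_exit);
  }
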
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.078690991@linutronix.de --- arch/x86/kernel/apic/apic.c | 23 ++++++++--------------- arch/x86/kernel/apic/msi.c | 3 ++- arch/x86/kernel/irq.c | 27 +++++++-------------------- 3 files changed, 17 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 6c2b807a7eae..b7bfd3a1abb7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2121,9 +2121,9 @@ void __init register_lapic_address(unsigned long address) */ /** - * smp_spurious_interrupt - Catch all for interrupts raised on unused vectors + * spurious_interrupt - Catch all for interrupts raised on unused vectors * @regs: Pointer to pt_regs on stack - * @error_code: The vector number is in the lower 8 bits + * @vector: The vector number * * This is invoked from ASM entry code to catch all interrupts which * trigger on an entry which is routed to the common_spurious idtentry @@ -2131,18 +2131,10 @@ void __init register_lapic_address(unsigned long address) * * Also called from smp_spurious_apic_interrupt(). */ -__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, - unsigned long vector) +DEFINE_IDTENTRY_IRQ(spurious_interrupt) { u32 v; - entering_irq(); - /* - * The push in the entry ASM code which stores the vector number on - * the stack in the error code slot is sign expanding. Just use the - * lower 8 bits. - */ - vector &= 0xFF; trace_spurious_apic_entry(vector); inc_irq_stat(irq_spurious_count); @@ -2163,21 +2155,22 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs, */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); if (v & (1 << (vector & 0x1f))) { - pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Acked\n", + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", vector, smp_processor_id()); ack_APIC_irq(); } else { - pr_info("Spurious interrupt (vector 0x%02lx) on CPU#%d. Not pending!\n", + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", vector, smp_processor_id()); } out: trace_spurious_apic_exit(vector); - exiting_irq(); } __visible void smp_spurious_apic_interrupt(struct pt_regs *regs) { - smp_spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + entering_irq(); + __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); + exiting_irq(); } /* diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 159bd0cb8548..5cbaca58af95 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -115,7 +115,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * denote it as spurious which is no harm as this is a rare event * and interrupt handlers have to cope with spurious interrupts * anyway. If the vector is unused, then it is marked so it won't - * trigger the 'No irq handler for vector' warning in do_IRQ(). + * trigger the 'No irq handler for vector' warning in + * common_interrupt(). * * This requires to hold vector lock to prevent concurrent updates to * the affected vector. 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 5495ea4debba..c449b8434036 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -20,6 +20,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -232,37 +233,25 @@ static __always_inline void handle_irq(struct irq_desc *desc, } /* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). */ -__visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) +DEFINE_IDTENTRY_IRQ(common_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - entering_irq(); - /* - * The push in the entry ASM code which stores the vector number on - * the stack in the error code slot is sign expanding. Just use the - * lower 8 bits. - */ - vector &= 0xFF; - - /* entering_irq() tells RCU that we're not quiescent. Check it. */ + /* entry code tells RCU that we're not quiescent. Check it. */ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { - if (IS_ENABLED(CONFIG_X86_32)) - __handle_irq(desc, regs); - else - generic_handle_irq_desc(desc); + handle_irq(desc, regs); } else { ack_APIC_irq(); if (desc == VECTOR_UNUSED) { - pr_emerg_ratelimited("%s: %d.%lu No irq handler for vector\n", + pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), vector); } else { @@ -270,8 +259,6 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs, unsigned long vector) } } - exiting_irq(); - set_irq_regs(old_regs); } -- cgit From db0338eec5836eea3bd1b274212234d04bac2034 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:39 +0200 Subject: x86/entry: Convert APIC interrupts to IDTENTRY_SYSVEC Convert APIC interrupts to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.280728850@linutronix.de --- arch/x86/kernel/apic/apic.c | 23 +++++------------------ arch/x86/kernel/idt.c | 8 ++++---- arch/x86/kernel/irq.c | 5 ++--- 3 files changed, 11 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bfd3a1abb7..9244377ed454 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1088,23 +1088,14 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ -__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt) { struct pt_regs *old_regs = set_irq_regs(regs); - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - * - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. 
- */ - entering_ack_irq(); + ack_APIC_irq(); trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); trace_local_timer_exit(LOCAL_TIMER_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } @@ -2129,7 +2120,7 @@ void __init register_lapic_address(unsigned long address) * trigger on an entry which is routed to the common_spurious idtentry * point. * - * Also called from smp_spurious_apic_interrupt(). + * Also called from sysvec_spurious_apic_interrupt(). */ DEFINE_IDTENTRY_IRQ(spurious_interrupt) { @@ -2166,17 +2157,15 @@ out: trace_spurious_apic_exit(vector); } -__visible void smp_spurious_apic_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt) { - entering_irq(); __spurious_interrupt(regs, SPURIOUS_APIC_VECTOR); - exiting_irq(); } /* * This interrupt should never happen with our APIC/SMP architecture */ -__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) { static const char * const error_interrupt_reason[] = { "Send CS error", /* APIC Error Bit 0 */ @@ -2190,7 +2179,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) }; u32 v, i = 0; - entering_irq(); trace_error_apic_entry(ERROR_APIC_VECTOR); /* First tickle the hardware, only then report what went on. -- REW */ @@ -2214,7 +2202,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs) apic_printk(APIC_DEBUG, KERN_CONT "\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); - exiting_irq(); } /** diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 20408e31c18d..93c1b27f40f4 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -132,8 +132,8 @@ static const __initconst struct idt_data apic_idts[] = { #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), @@ -145,8 +145,8 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_UV INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), #endif - INTG(SPURIOUS_APIC_VECTOR, spurious_apic_interrupt), - INTG(ERROR_APIC_VECTOR, error_interrupt), + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c449b8434036..7e3005274f83 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -268,17 +268,16 @@ void (*x86_platform_ipi_callback)(void) = NULL; /* * Handler for X86_PLATFORM_IPI_VECTOR. 
*/ -__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_ack_irq(); + ack_APIC_irq(); trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) x86_platform_ipi_callback(); trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); - exiting_irq(); set_irq_regs(old_regs); } #endif -- cgit From 582f9191231b994582ad5349a7b06b3255c926fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:40 +0200 Subject: x86/entry: Convert SMP system vectors to IDTENTRY_SYSVEC Convert SMP system vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.372234635@linutronix.de --- arch/x86/kernel/apic/vector.c | 5 ++--- arch/x86/kernel/idt.c | 10 +++++----- arch/x86/kernel/smp.c | 18 +++++++----------- 3 files changed, 14 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 67768e54438b..c48be6e1f676 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -861,13 +861,13 @@ static void free_moved_vector(struct apic_chip_data *apicd) apicd->move_in_progress = 0; } -asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup) { struct hlist_head *clhead = this_cpu_ptr(&cleanup_list); struct apic_chip_data *apicd; struct hlist_node *tmp; - entering_ack_irq(); + ack_APIC_irq(); /* Prevent vectors vanishing under us */ raw_spin_lock(&vector_lock); @@ -892,7 +892,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void) } raw_spin_unlock(&vector_lock); - exiting_irq(); } static void __send_cleanup_vector(struct apic_chip_data *apicd) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 93c1b27f40f4..018a5424b574 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -112,11 +112,11 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), - INTG(CALL_FUNCTION_VECTOR, call_function_interrupt), - INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt), - INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt), - INTG(REBOOT_VECTOR, reboot_interrupt), + INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), + INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), + INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), + INTG(REBOOT_VECTOR, asm_sysvec_reboot), #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index b8d4e9c3c070..e5647daa7e96 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -130,13 +131,11 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) /* * this function calls the 'stop' function on all other CPUs in the system. 
*/ - -asmlinkage __visible void smp_reboot_interrupt(void) +DEFINE_IDTENTRY_SYSVEC(sysvec_reboot) { - ipi_entering_ack_irq(); + ack_APIC_irq(); cpu_emergency_vmxoff(); stop_this_cpu(NULL); - irq_exit(); } static int register_stop_handler(void) @@ -227,7 +226,6 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); - kvm_set_cpu_l1tf_flush_l1d(); if (trace_resched_ipi_enabled()) { /* @@ -244,24 +242,22 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) scheduler_ipi(); } -__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_entry(CALL_FUNCTION_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_interrupt(); trace_call_function_exit(CALL_FUNCTION_VECTOR); - exiting_irq(); } -__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r) +DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); inc_irq_stat(irq_call_count); generic_smp_call_function_single_interrupt(); trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); - exiting_irq(); } static int __init nonmi_ipi_setup(char *str) -- cgit From 720909a7abd351535bfb485a0ecce03c2e4467e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:41 +0200 Subject: x86/entry: Convert various system vectors Convert various system vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. 
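As a before/after sketch of the conversion (the handler name and body
are illustrative, not taken from this diff):

  /* Before: entry/exit handling open-coded in the handler. */
  asmlinkage __visible void __irq_entry smp_example_ipi(struct pt_regs *regs)
  {
          ipi_entering_ack_irq();
          inc_irq_stat(irq_call_count);
          exiting_irq();
  }

  /* After: only the APIC ack and the actual work remain. */
  DEFINE_IDTENTRY_SYSVEC(sysvec_example_ipi)
  {
          ack_APIC_irq();
          inc_irq_stat(irq_call_count);
  }
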
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.464812973@linutronix.de --- arch/x86/kernel/cpu/mce/amd.c | 5 ++--- arch/x86/kernel/cpu/mce/therm_throt.c | 5 ++--- arch/x86/kernel/cpu/mce/threshold.c | 5 ++--- arch/x86/kernel/idt.c | 28 ++++++++++++++-------------- arch/x86/kernel/irq_work.c | 6 +++--- 5 files changed, 23 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 52de616a8065..a906d68a18a2 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -907,14 +907,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); } -asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) { - entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); inc_irq_stat(irq_deferred_error_count); deferred_error_int_vector(); trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index f36dc0742085..a7cd2d203ced 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void) static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_thermal) { - entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); trace_thermal_apic_exit(THERMAL_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 28812cc15300..6a059a035021 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -21,12 +21,11 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_threshold) { - entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); - exiting_ack_irq(); + ack_APIC_irq(); } diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 018a5424b574..3d811d058f2e 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -120,33 +120,33 @@ static const __initconst struct idt_data apic_idts[] = { #endif #ifdef CONFIG_X86_THERMAL_VECTOR - INTG(THERMAL_APIC_VECTOR, thermal_interrupt), + INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal), #endif #ifdef CONFIG_X86_MCE_THRESHOLD - INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt), + INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold), #endif #ifdef CONFIG_X86_MCE_AMD - INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt), + INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error), #endif #ifdef CONFIG_X86_LOCAL_APIC - INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), - INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), + INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), + INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef 
CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK - INTG(IRQ_WORK_VECTOR, irq_work_interrupt), + INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif -#ifdef CONFIG_X86_UV - INTG(UV_BAU_MESSAGE, uv_bau_message_intr1), -#endif - INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), - INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_UV + INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message), +# endif + INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), + INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), #endif }; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index 80bee7695a20..890d4778cd35 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -9,18 +9,18 @@ #include #include #include +#include #include #include #ifdef CONFIG_X86_LOCAL_APIC -__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work) { - ipi_entering_ack_irq(); + ack_APIC_irq(); trace_irq_work_entry(IRQ_WORK_VECTOR); inc_irq_stat(apic_irq_work_irqs); irq_work_run(); trace_irq_work_exit(IRQ_WORK_VECTOR); - exiting_irq(); } void arch_irq_work_raise(void) -- cgit From 9c3b1f4975c46fc2932fd6d53e63c14f0ddf985f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:42 +0200 Subject: x86/entry: Convert KVM vectors to IDTENTRY_SYSVEC* Convert KVM specific system vectors to IDTENTRY_SYSVEC*: The two empty stub handlers which only increment the stats counter do no need to run on the interrupt stack. Use IDTENTRY_SYSVEC_SIMPLE for them. The wakeup handler does more work and runs on the interrupt stack. None of these handlers need to save and restore the irq_regs pointer. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Paolo Bonzini Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.555715519@linutronix.de --- arch/x86/kernel/idt.c | 6 +++--- arch/x86/kernel/irq.c | 24 ++++++------------------ 2 files changed, 9 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 3d811d058f2e..faaadd430882 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -135,9 +135,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), # ifdef CONFIG_HAVE_KVM - INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), - INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), - INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), + INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), + INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), + INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7e3005274f83..181060247e3c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -298,41 +298,29 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler); /* * Handler for POSTED_INTERRUPT_VECTOR. 
*/ -__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_ipis); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ -__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_wakeup_ipis); kvm_posted_intr_wakeup_handler(); - exiting_irq(); - set_irq_regs(old_regs); } /* * Handler for POSTED_INTERRUPT_NESTED_VECTOR. */ -__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) { - struct pt_regs *old_regs = set_irq_regs(regs); - - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(kvm_posted_intr_nested_ipis); - exiting_irq(); - set_irq_regs(old_regs); } #endif -- cgit From a16be368dd3fb695077cc9bc59c988b548955eec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:43 +0200 Subject: x86/entry: Convert various hypervisor vectors to IDTENTRY_SYSVEC Convert various hypervisor vectors to IDTENTRY_SYSVEC: - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC - Remove the ASM idtentries in 64-bit - Remove the BUILD_INTERRUPT entries in 32-bit - Remove the old prototypes No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Reviewed-by: Wei Liu Link: https://lore.kernel.org/r/20200521202119.647997594@linutronix.de --- arch/x86/kernel/cpu/acrn.c | 9 ++++----- arch/x86/kernel/cpu/mshyperv.c | 22 ++++++++++------------ 2 files changed, 14 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c index 676022e71791..1da9b1c9a2db 100644 --- a/arch/x86/kernel/cpu/acrn.c +++ b/arch/x86/kernel/cpu/acrn.c @@ -10,10 +10,10 @@ */ #include -#include #include #include #include +#include #include static uint32_t __init acrn_detect(void) @@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void) static void __init acrn_init_platform(void) { /* Setup the IDT for ACRN hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback); } static bool acrn_x2apic_available(void) @@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void) static void (*acrn_intr_handler)(void); -__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) * will block the interrupt whose vector is lower than * HYPERVISOR_CALLBACK_VECTOR. 
*/ - entering_ack_irq(); + ack_APIC_irq(); inc_irq_stat(irq_hv_callback_count); if (acrn_intr_handler) acrn_intr_handler(); - exiting_irq(); set_irq_regs(old_regs); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index ebf34c7bc8bc..af94f05a5c66 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); static void (*hv_crash_handler)(struct pt_regs *regs); -__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(irq_hv_callback_count); if (vmbus_handler) vmbus_handler(); @@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs) if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); * Routines to do per-architecture handling of stimer0 * interrupts when in Direct Mode */ - -__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); - entering_irq(); inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ack_APIC_irq(); - exiting_irq(); set_irq_regs(old_regs); } @@ -331,17 +327,19 @@ static void __init ms_hyperv_init_platform(void) x86_platform.apic_post_init = hyperv_init; hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback); /* Setup the IDT for reenlightenment notifications */ - if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) { alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, - hyperv_reenlightenment_vector); + asm_sysvec_hyperv_reenlightenment); + } /* Setup the IDT for stimer0 */ - if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) { alloc_intr_gate(HYPERV_STIMER0_VECTOR, - hv_stimer0_callback_vector); + asm_sysvec_hyperv_stimer0); + } # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; -- cgit From 13cad9851ef1d004640991d45227dd35c08f45fc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:45 +0200 Subject: x86/entry: Convert reschedule interrupt to IDTENTRY_SYSVEC_SIMPLE The scheduler IPI does not need the full interrupt entry handling logic when the entry is from kernel mode. Use IDTENTRY_SYSVEC_SIMPLE and spare all the overhead. 
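Roughly, the difference between the two flavors (an approximation of
the idtentry internals, not the verbatim macros; __sysvec_handler
stands in for the generated inner function): the full variant runs the
handler on the irq stack with complete irq_enter_rcu()/irq_exit_rcu()
accounting, while the _SIMPLE variant invokes it directly with only
raw nesting accounting:

  /* DEFINE_IDTENTRY_SYSVEC: full entry, irq stack switch */
  irq_enter_rcu();
  run_on_irqstack_cond(__sysvec_handler, regs, regs);
  irq_exit_rcu();

  /* DEFINE_IDTENTRY_SYSVEC_SIMPLE: minimal entry, no stack switch */
  __irq_enter_raw();
  __sysvec_handler(regs);
  __irq_exit_raw();
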
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20200521202119.835425642@linutronix.de --- arch/x86/kernel/idt.c | 2 +- arch/x86/kernel/smp.c | 19 ++++--------------- arch/x86/kernel/tracepoint.c | 17 ----------------- 3 files changed, 5 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index faaadd430882..bc9b0d1d7bb8 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -112,7 +112,7 @@ static const __initconst struct idt_data def_idts[] = { */ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_SMP - INTG(RESCHEDULE_VECTOR, reschedule_interrupt), + INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi), INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function), INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single), INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup), diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index e5647daa7e96..eff4ce3b10da 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -220,26 +220,15 @@ static void native_stop_other_cpus(int wait) /* * Reschedule call back. KVM uses this interrupt to force a cpu out of - * guest mode + * guest mode. */ -__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi) { ack_APIC_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); inc_irq_stat(irq_resched_count); - - if (trace_resched_ipi_enabled()) { - /* - * scheduler_ipi() might call irq_enter() as well, but - * nested calls are fine. - */ - irq_enter(); - trace_reschedule_entry(RESCHEDULE_VECTOR); - scheduler_ipi(); - trace_reschedule_exit(RESCHEDULE_VECTOR); - irq_exit(); - return; - } scheduler_ipi(); + trace_reschedule_exit(RESCHEDULE_VECTOR); } DEFINE_IDTENTRY_SYSVEC(sysvec_call_function) diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 496748ed266a..fcfc077afe2d 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -25,20 +25,3 @@ void trace_pagefault_unreg(void) { static_branch_dec(&trace_pagefault_key); } - -#ifdef CONFIG_SMP - -DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key); - -int trace_resched_ipi_reg(void) -{ - static_branch_inc(&trace_resched_ipi_key); - return 0; -} - -void trace_resched_ipi_unreg(void) -{ - static_branch_dec(&trace_resched_ipi_key); -} - -#endif -- cgit From 75da04f7f3cb416a68475e040175dc013da32de2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 May 2020 22:05:46 +0200 Subject: x86/entry: Remove the apic/BUILD interrupt leftovers Remove all the code which was there to emit the system vector stubs. All users are gone. Move the now unused GET_CR2_INTO macro muck to head_64.S where the last user is. Fixup the eye hurting comment there while at it. 
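The GET_CR2_INTO move in the diff below keeps a single macro name with two config-dependent expansions. A tiny userspace sketch of that pattern follows; the read_cr2_*() helpers here are invented stand-ins for the real asm sequences:

#include <stdio.h>

#define CONFIG_PARAVIRT_MODEL 1

static unsigned long read_cr2_native(void)   { return 0xfff1000; }
static unsigned long read_cr2_paravirt(void) { return 0xfff2000; }

/* One name, two expansions, selected at build time. */
#if CONFIG_PARAVIRT_MODEL
# define GET_CR2() read_cr2_paravirt()
#else
# define GET_CR2() read_cr2_native()
#endif

int main(void)
{
	/* Early page fault path: fetch the faulting address. */
	printf("fault address: %#lx\n", GET_CR2());
	return 0;
}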
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Acked-by: Andy Lutomirski
Link: https://lore.kernel.org/r/20200521202119.927433002@linutronix.de
---
 arch/x86/kernel/head_64.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 4fc33fdf0f16..16da4ac01597 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -29,15 +29,16 @@
 #ifdef CONFIG_PARAVIRT_XXL
 #include
 #include
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
 #else
 #define INTERRUPT_RETURN iretq
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
 #endif

-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
+/*
+ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
  * because we need identity-mapped pages.
- *
  */
-
 #define l4_index(x)	(((x) >> 39) & 511)
 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-- 
cgit 

From 3ffdfdcec1bae39b68b990762350b3cd3127f23f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 21 May 2020 22:05:51 +0200
Subject: x86/entry: Move paranoid irq tracing out of ASM code

The last step to remove the irq tracing cruft from ASM. Ignore #DF as the
machine is going to die anyway.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Acked-by: Andy Lutomirski
Link: https://lkml.kernel.org/r/20200521202120.414043330@linutronix.de
---
 arch/x86/kernel/cpu/mce/core.c |  3 +++
 arch/x86/kernel/nmi.c          |  3 +++
 arch/x86/kernel/traps.c        | 11 +++++++++++
 3 files changed, 17 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c47f004f6231..068e6cab1286 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1922,7 +1922,10 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
 	 * that out because it's an indirect call. Annotate it.
 	 */
 	instrumentation_begin();
+	trace_hardirqs_off_prepare();
 	machine_check_vector(regs);
+	if (regs->flags & X86_EFLAGS_IF)
+		trace_hardirqs_on_prepare();
 	instrumentation_end();
 	nmi_exit();
 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3052c78f03aa..5df4e7f58369 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -330,6 +330,7 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
 	__this_cpu_write(last_nmi_rip, regs->ip);

 	instrumentation_begin();
+	trace_hardirqs_off_prepare();

 	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
@@ -416,6 +417,8 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);

 out:
+	if (regs->flags & X86_EFLAGS_IF)
+		trace_hardirqs_on_prepare();
 	instrumentation_end();
 }

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f28be3e51cca..50fb9cd5be97 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -634,8 +634,11 @@ DEFINE_IDTENTRY_RAW(exc_int3)
 	} else {
 		nmi_enter();
 		instrumentation_begin();
+		trace_hardirqs_off_prepare();
 		if (!do_int3(regs))
 			die("int3", regs, 0);
+		if (regs->flags & X86_EFLAGS_IF)
+			trace_hardirqs_on_prepare();
 		instrumentation_end();
 		nmi_exit();
 	}
@@ -850,6 +853,10 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 					     unsigned long dr6)
 {
 	nmi_enter();
+	instrumentation_begin();
+	trace_hardirqs_off_prepare();
+	instrumentation_end();
+
 	/*
 	 * The SDM says "The processor clears the BTF flag when it
	 * generates a debug exception." Clear TIF_BLOCKSTEP to keep
@@ -871,6 +878,10 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 	if (dr6)
 		handle_debug(regs, dr6, false);

+	instrumentation_begin();
+	if (regs->flags & X86_EFLAGS_IF)
+		trace_hardirqs_on_prepare();
+	instrumentation_end();
 	nmi_exit();
 }
-- 
cgit 

From d390e6de89d30402bd06056c40cea72328aec9b1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 29 May 2020 23:27:29 +0200
Subject: x86/hw_breakpoint: Add within_area() to check data breakpoints

Add a within_area() helper to check whether the data breakpoints overlap
with cpu_entry_area.

It will be used to completely prevent data breakpoints on GDT, IDT, or TSS.

Signed-off-by: Lai Jiangshan
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200526014221.2119-2-laijs@linux.alibaba.com
Link: https://lkml.kernel.org/r/20200529213320.784524504@infradead.org
---
 arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 9ddf441ccaa8..c149c7b29ac3 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -227,14 +227,23 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
 	return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
 }

+/*
+ * Checks whether the range [addr, end] overlaps the area [base, base + size).
+ */
+static inline bool within_area(unsigned long addr, unsigned long end,
+			       unsigned long base, unsigned long size)
+{
+	return end >= base && addr < (base + size);
+}
+
 /*
  * Checks whether the range from addr to end, inclusive, overlaps the CPU
  * entry area range.
  */
 static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end)
 {
-	return end >= CPU_ENTRY_AREA_BASE &&
-	       addr < (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_TOTAL_SIZE);
+	return within_area(addr, end, CPU_ENTRY_AREA_BASE,
+			   CPU_ENTRY_AREA_TOTAL_SIZE);
 }

 static int arch_build_bp_info(struct perf_event *bp,
-- 
cgit 

From 97417cb9ad4ed052d7a4c5c0d75db1ff1b0981fb Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 29 May 2020 23:27:30 +0200
Subject: x86/hw_breakpoint: Prevent data breakpoints on direct GDT

A data breakpoint on the GDT can be fatal and must be avoided. The GDT in
the CPU entry area is already protected, but not the direct GDT.

Add the necessary protection.

Signed-off-by: Lai Jiangshan
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200526014221.2119-3-laijs@linux.alibaba.com
Link: https://lkml.kernel.org/r/20200529213320.840953950@infradead.org
---
 arch/x86/kernel/hw_breakpoint.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index c149c7b29ac3..f859095c1b6c 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include

 /* Per cpu debug control register value */
 DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -237,13 +238,26 @@ static inline bool within_area(unsigned long addr, unsigned long end,
 }

 /*
- * Checks whether the range from addr to end, inclusive, overlaps the CPU
- * entry area range.
+ * Checks whether the range from addr to end, inclusive, overlaps the fixed
+ * mapped CPU entry area range or other ranges used for CPU entry.
  */
-static inline bool within_cpu_entry_area(unsigned long addr, unsigned long end)
+static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
 {
-	return within_area(addr, end, CPU_ENTRY_AREA_BASE,
-			   CPU_ENTRY_AREA_TOTAL_SIZE);
+	int cpu;
+
+	/* CPU entry area is always used for CPU entry */
+	if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+			CPU_ENTRY_AREA_TOTAL_SIZE))
+		return true;
+
+	for_each_possible_cpu(cpu) {
+		/* The original rw GDT is being used after load_direct_gdt() */
+		if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
+				GDT_SIZE))
+			return true;
+	}
+
+	return false;
 }

 static int arch_build_bp_info(struct perf_event *bp,
@@ -257,12 +271,12 @@ static int arch_build_bp_info(struct perf_event *bp,
 		return -EINVAL;

 	/*
-	 * Prevent any breakpoint of any type that overlaps the
-	 * cpu_entry_area.  This protects the IST stacks and also
+	 * Prevent any breakpoint of any type that overlaps the CPU
+	 * entry area and data.  This protects the IST stacks and also
 	 * reduces the chance that we ever find out what happens if
 	 * there's a data breakpoint on the GDT, IDT, or TSS.
 	 */
-	if (within_cpu_entry_area(attr->bp_addr, bp_end))
+	if (within_cpu_entry(attr->bp_addr, bp_end))
 		return -EINVAL;

 	hw->address = attr->bp_addr;
-- 
cgit 

From f9fe0b89f05441c6e4034e024c2c75a0d93024c1 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 29 May 2020 23:27:31 +0200
Subject: x86/hw_breakpoint: Prevent data breakpoints on per_cpu cpu_tss_rw

cpu_tss_rw is not directly referenced by hardware, but cpu_tss_rw is
accessed in CPU entry code, especially when #DB shifts its stacks.

If a data breakpoint would be set on cpu_tss_rw.x86_tss.ist[IST_INDEX_DB],
it would cause recursive #DB ending up in a double fault.

Add it to the list of protected items.

Signed-off-by: Lai Jiangshan
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200526014221.2119-4-laijs@linux.alibaba.com
Link: https://lkml.kernel.org/r/20200529213320.897976479@infradead.org
---
 arch/x86/kernel/hw_breakpoint.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f859095c1b6c..f311bbfda1ba 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -255,6 +255,15 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
 		if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
 				GDT_SIZE))
 			return true;
+
+		/*
+		 * cpu_tss_rw is not directly referenced by hardware, but
+		 * cpu_tss_rw is also used in CPU entry code.
+		 */
+		if (within_area(addr, end,
+				(unsigned long)&per_cpu(cpu_tss_rw, cpu),
+				sizeof(struct tss_struct)))
+			return true;
 	}

 	return false;
-- 
cgit 

From fdef24dfccb7be06e6ebe11d6c6c56987421870f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 29 May 2020 23:27:32 +0200
Subject: x86/hw_breakpoint: Prevent data breakpoints on user_pcid_flush_mask

The per-CPU user_pcid_flush_mask is used in the low-level entry code. A
data breakpoint can cause #DB recursion.

Protect the full cpu_tlbstate structure for simplicity.
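The overlap predicate these patches build on is small enough to test in isolation. Here is a runnable userspace check of the same expression within_area() uses, with the inclusive [addr, end] breakpoint range against the half-open [base, base + size) protected area; the addresses are made up for illustration:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Same predicate as the kernel's within_area(): [addr, end] is
 * inclusive, [base, base + size) is half-open. */
static bool within_area(unsigned long addr, unsigned long end,
			unsigned long base, unsigned long size)
{
	return end >= base && addr < (base + size);
}

int main(void)
{
	/* Breakpoint entirely below the protected area: no overlap. */
	assert(!within_area(0x0ff8, 0x0fff, 0x1000, 0x100));
	/* Last byte of the breakpoint touches the first protected byte. */
	assert(within_area(0x0ff9, 0x1000, 0x1000, 0x100));
	/* Breakpoint starting on the last protected byte overlaps. */
	assert(within_area(0x10ff, 0x1106, 0x1000, 0x100));
	/* Breakpoint starting right past the area does not. */
	assert(!within_area(0x1100, 0x1107, 0x1000, 0x100));
	puts("overlap checks behave as expected");
	return 0;
}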
Signed-off-by: Lai Jiangshan
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200526014221.2119-5-laijs@linux.alibaba.com
Link: https://lkml.kernel.org/r/20200529213320.955117574@infradead.org
---
 arch/x86/kernel/hw_breakpoint.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f311bbfda1ba..fc1743a2b0e9 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include

 /* Per cpu debug control register value */
 DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -264,6 +265,16 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
 				(unsigned long)&per_cpu(cpu_tss_rw, cpu),
 				sizeof(struct tss_struct)))
 			return true;
+
+		/*
+		 * cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
+		 * If a data breakpoint is set on it, it will cause an
+		 * unwanted #DB. Protect the full cpu_tlbstate structure
+		 * to be sure.
+		 */
+		if (within_area(addr, end,
+				(unsigned long)&per_cpu(cpu_tlbstate, cpu),
+				sizeof(struct tlb_state)))
+			return true;
 	}

 	return false;
-- 
cgit 

From e1de11d4d1a64ac1b90b9833f1a3629dae18facb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 May 2020 23:27:33 +0200
Subject: x86/entry: Introduce local_db_{save,restore}()

In order to allow exceptions other than #DB to disable breakpoints,
provide common helpers.

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200529213321.012060983@infradead.org
---
 arch/x86/kernel/traps.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 50fb9cd5be97..bcb9dd961c6d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -727,15 +727,7 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
 	 * Entry text is excluded for HW_BP_X and cpu_entry_area, which
 	 * includes the entry stack is excluded for everything.
 	 */
-	get_debugreg(*dr7, 7);
-	set_debugreg(0, 7);
-
-	/*
-	 * Ensure the compiler doesn't lower the above statements into
-	 * the critical section; disabling breakpoints late would not
-	 * be good.
-	 */
-	barrier();
+	*dr7 = local_db_save();

 	/*
 	 * The Intel SDM says:
@@ -756,13 +748,7 @@ static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)

 static __always_inline void debug_exit(unsigned long dr7)
 {
-	/*
-	 * Ensure the compiler doesn't raise this statement into
-	 * the critical section; enabling breakpoints early would
-	 * not be good.
-	 */
-	barrier();
-	set_debugreg(dr7, 7);
+	local_db_restore(dr7);
 }

 /*
-- 
cgit 

From fd338e3564b0b8597a89f83941a0eda3e5092cc0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 May 2020 23:27:34 +0200
Subject: x86/entry, nmi: Disable #DB

Instead of playing stupid games with IST stacks, fully disallow #DB
during NMIs. There is absolutely no reason to allow them, and killing
this saves a heap of trouble.

#DB is already forbidden on noinstr and CEA, so there can't be a #DB before
this. Disabling it right after nmi_enter() ensures that the full NMI code
is protected.
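The save/clear/restore pairing that makes this work can be modeled without hardware access. In the sketch below, fake_dr7 stands in for the real DR7 register and the helpers are invented analogues of local_db_save()/local_db_restore(); the inline asm is only a compiler barrier, as in the kernel's barrier():

#include <stdio.h>

static unsigned long fake_dr7 = 0x401;	/* pretend one breakpoint is armed */

/* Model of local_db_save(): read DR7, then disable all breakpoints.
 * The barrier keeps the compiler from sinking the clear below the
 * fragile code it is supposed to protect. */
static unsigned long local_db_save_model(void)
{
	unsigned long dr7 = fake_dr7;

	fake_dr7 = 0;
	__asm__ __volatile__("" ::: "memory");	/* barrier() */
	return dr7;
}

static void local_db_restore_model(unsigned long dr7)
{
	__asm__ __volatile__("" ::: "memory");	/* barrier() */
	fake_dr7 = dr7;
}

static void nmi_body(void)
{
	printf("NMI body runs with DR7=%#lx: no #DB recursion\n", fake_dr7);
}

int main(void)
{
	unsigned long dr7 = local_db_save_model();	/* right after entry */

	nmi_body();
	local_db_restore_model(dr7);			/* right before exit */
	printf("breakpoints rearmed: DR7=%#lx\n", fake_dr7);
	return 0;
}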
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.069223695@infradead.org --- arch/x86/kernel/nmi.c | 55 +++------------------------------------------------ 1 file changed, 3 insertions(+), 52 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 5df4e7f58369..873a8c040b86 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -474,40 +474,7 @@ enum nmi_states { }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); static DEFINE_PER_CPU(unsigned long, nmi_cr2); - -#ifdef CONFIG_X86_64 -/* - * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without - * some care, the inner breakpoint will clobber the outer breakpoint's - * stack. - * - * If a breakpoint is being processed, and the debug stack is being - * used, if an NMI comes in and also hits a breakpoint, the stack - * pointer will be set to the same fixed address as the breakpoint that - * was interrupted, causing that stack to be corrupted. To handle this - * case, check if the stack that was interrupted is the debug stack, and - * if so, change the IDT so that new breakpoints will use the current - * stack and not switch to the fixed address. On return of the NMI, - * switch back to the original IDT. - */ -static DEFINE_PER_CPU(int, update_debug_stack); - -static noinstr bool is_debug_stack(unsigned long addr) -{ - struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); - unsigned long top = CEA_ESTACK_TOP(cs, DB); - unsigned long bot = CEA_ESTACK_BOT(cs, DB1); - - if (__this_cpu_read(debug_stack_usage)) - return true; - /* - * Note, this covers the guard page between DB and DB1 as well to - * avoid two checks. But by all means @addr can never point into - * the guard page. - */ - return addr >= bot && addr < top; -} -#endif +static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_NMI(exc_nmi) { @@ -522,18 +489,7 @@ DEFINE_IDTENTRY_NMI(exc_nmi) this_cpu_write(nmi_cr2, read_cr2()); nmi_restart: -#ifdef CONFIG_X86_64 - /* - * If we interrupted a breakpoint, it is possible that - * the nmi handler will have breakpoints too. We need to - * change the IDT such that breakpoints that happen here - * continue to use the NMI stack. - */ - if (unlikely(is_debug_stack(regs->sp))) { - debug_stack_set_zero(); - this_cpu_write(update_debug_stack, 1); - } -#endif + this_cpu_write(nmi_dr7, local_db_save()); nmi_enter(); @@ -544,12 +500,7 @@ nmi_restart: nmi_exit(); -#ifdef CONFIG_X86_64 - if (unlikely(this_cpu_read(update_debug_stack))) { - debug_stack_reset(); - this_cpu_write(update_debug_stack, 0); - } -#endif + local_db_restore(this_cpu_read(nmi_dr7)); if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) write_cr2(this_cpu_read(nmi_cr2)); -- cgit From cd840e424f27fcc1ae8d14b7ec3ec4560ee6561a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:35 +0200 Subject: x86/entry, mce: Disallow #DB during #MC #MC is fragile as heck, don't tempt fate. 
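A toy model of why the guard works: in the sketch below a "data breakpoint" can only fire while fake_dr7 is non-zero, so bracketing the #MC body with a save/restore makes recursion impossible. All names are invented stand-ins; the real code in the diff below uses local_db_save()/local_db_restore():

#include <stdio.h>

static unsigned long fake_dr7 = 0x401;	/* one armed breakpoint */

static void maybe_debug_trap(const char *where)
{
	if (fake_dr7)
		printf("#DB could fire %s\n", where);
	else
		printf("no #DB possible %s\n", where);
}

static void exc_machine_check_model(void)
{
	unsigned long dr7 = fake_dr7;	/* local_db_save() */

	fake_dr7 = 0;
	maybe_debug_trap("inside the #MC handler");
	fake_dr7 = dr7;			/* local_db_restore() */
}

int main(void)
{
	maybe_debug_trap("before #MC");
	exc_machine_check_model();
	maybe_debug_trap("after #MC");
	return 0;
}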
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200529213321.131187767@infradead.org
---
 arch/x86/kernel/cpu/mce/core.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 068e6cab1286..be499267bbb4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1943,22 +1943,34 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs)
 /* MCE hit kernel mode */
 DEFINE_IDTENTRY_MCE(exc_machine_check)
 {
+	unsigned long dr7;
+
+	dr7 = local_db_save();
 	exc_machine_check_kernel(regs);
+	local_db_restore(dr7);
 }

 /* The user mode variant. */
 DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
 {
+	unsigned long dr7;
+
+	dr7 = local_db_save();
 	exc_machine_check_user(regs);
+	local_db_restore(dr7);
 }

 #else
 /* 32bit unified entry point */
 DEFINE_IDTENTRY_MCE(exc_machine_check)
 {
+	unsigned long dr7;
+
+	dr7 = local_db_save();
 	if (user_mode(regs))
 		exc_machine_check_user(regs);
 	else
 		exc_machine_check_kernel(regs);
+	local_db_restore(dr7);
 }
 #endif
-- 
cgit 

From 84b6a3491567a540f955e18d8e615493afa36df0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 May 2020 23:27:36 +0200
Subject: x86/entry: Optimize local_db_save() for virt

Because DRn access is 'difficult' with virt, while the DR7 read is cheaper
than a cacheline miss on native, add a virt-specific fast path to
local_db_save() which avoids touching DRn entirely when breakpoints are
not in use.

Suggested-by: Andy Lutomirski
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200529213321.187833200@infradead.org
---
 arch/x86/kernel/hw_breakpoint.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index fc1743a2b0e9..8cdf29ffd95f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -99,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	unsigned long *dr7;
 	int i;

+	lockdep_assert_irqs_disabled();
+
 	for (i = 0; i < HBP_NUM; i++) {
 		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

@@ -117,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	dr7 = this_cpu_ptr(&cpu_dr7);
 	*dr7 |= encode_dr7(i, info->len, info->type);

+	/*
+	 * Ensure we first write cpu_dr7 before we set the DR7 register.
+	 * This ensures an NMI never sees cpu_dr7 == 0 when DR7 is not.
+	 */
+	barrier();
+
 	set_debugreg(*dr7, 7);
 	if (info->mask)
 		set_dr_addr_mask(info->mask, i);
@@ -136,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 {
 	struct arch_hw_breakpoint *info = counter_arch_bp(bp);
-	unsigned long *dr7;
+	unsigned long dr7;
 	int i;

+	lockdep_assert_irqs_disabled();
+
 	for (i = 0; i < HBP_NUM; i++) {
 		struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);

@@ -151,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 	if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
 		return;

-	dr7 = this_cpu_ptr(&cpu_dr7);
-	*dr7 &= ~__encode_dr7(i, info->len, info->type);
+	dr7 = this_cpu_read(cpu_dr7);
+	dr7 &= ~__encode_dr7(i, info->len, info->type);

-	set_debugreg(*dr7, 7);
+	set_debugreg(dr7, 7);
 	if (info->mask)
 		set_dr_addr_mask(0, i);
+
+	/*
+	 * Ensure the write to cpu_dr7 is after we've set the DR7 register.
+	 * This ensures an NMI never sees cpu_dr7 == 0 when DR7 is not.
+	 */
+	barrier();
+
+	this_cpu_write(cpu_dr7, dr7);
 }

 static int arch_bp_generic_len(int x86_len)
-- 
cgit 

From f9912ada82862df341b3e86864cbd532d0d24b84 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 May 2020 23:27:37 +0200
Subject: x86/entry: Remove debug IDT frobbing

This is all unused now.

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200529213321.245019500@infradead.org
---
 arch/x86/kernel/cpu/common.c | 17 -----------------
 arch/x86/kernel/idt.c        | 30 ------------------------------
 arch/x86/kernel/traps.c      |  9 ---------
 3 files changed, 56 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f4645f9ff9cb..043d93cdcaad 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1706,23 +1706,6 @@ void syscall_init(void)
 	       X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
 }

-DEFINE_PER_CPU(int, debug_stack_usage);
-DEFINE_PER_CPU(u32, debug_idt_ctr);
-
-noinstr void debug_stack_set_zero(void)
-{
-	this_cpu_inc(debug_idt_ctr);
-	load_current_idt();
-}
-
-noinstr void debug_stack_reset(void)
-{
-	if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
-		return;
-	if (this_cpu_dec_return(debug_idt_ctr) == 0)
-		load_current_idt();
-}
-
 #else	/* CONFIG_X86_64 */

 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index bc9b0d1d7bb8..226c99229886 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -158,14 +158,6 @@ static const __initconst struct idt_data apic_idts[] = {
 static const __initconst struct idt_data early_pf_idts[] = {
 	INTG(X86_TRAP_PF,		asm_exc_page_fault),
 };
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-static const __initconst struct idt_data dbg_idts[] = {
-	INTG(X86_TRAP_DB,	asm_exc_debug),
-};
 #endif

 /* Must be page-aligned because the real IDT is used in a fixmap. */
@@ -177,9 +169,6 @@ struct desc_ptr idt_descr __ro_after_init = {
 };

 #ifdef CONFIG_X86_64
-/* No need to be aligned, but done to keep all IDTs defined the same way. */
-gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
-
 /*
  * The exceptions which use Interrupt stacks. They are setup after
  * cpu_init() when the TSS has been initialized.
  */
@@ -192,15 +181,6 @@ static const __initconst struct idt_data ist_idts[] = {
 	ISTG(X86_TRAP_MC,	asm_exc_machine_check,	IST_INDEX_MCE),
 #endif
 };
-
-/*
- * Override for the debug_idt. Same as the default, but with interrupt
- * stack set to DEFAULT_STACK (0). Required for NMI trap handling.
- */
-const struct desc_ptr debug_idt_descr = {
-	.size		= IDT_ENTRIES * 16 - 1,
-	.address	= (unsigned long) debug_idt_table,
-};
 #endif

 static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
@@ -292,16 +272,6 @@ void __init idt_setup_ist_traps(void)
 {
 	idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
 }
-
-/**
- * idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
- */
-void __init idt_setup_debugidt_traps(void)
-{
-	memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
-
-	idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
-}
 #endif

 /**
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bcb9dd961c6d..6f887be1ac0c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -798,12 +798,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6,
 		return;
 	}

-	/*
-	 * Let others (NMI) know that the debug stack is in use
-	 * as we may switch to the interrupt stack.
-	 */
-	debug_stack_usage_inc();
-
 	/* It's safe to allow irq's after DR6 has been saved */
 	cond_local_irq_enable(regs);

@@ -831,7 +825,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6,

 out:
 	cond_local_irq_disable(regs);
-	debug_stack_usage_dec();
 	instrumentation_end();
 }

@@ -1077,6 +1070,4 @@ void __init trap_init(void)
 	cpu_init();

 	idt_setup_ist_traps();
-
-	idt_setup_debugidt_traps();
 }
-- 
cgit 

From fd501d4f0399700011acde486576c7c1eb8e7a61 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 29 May 2020 23:27:38 +0200
Subject: x86/entry: Remove DBn stacks

Both #DB itself, as well as all other IST users (NMI, #MC), now clear DR7
on entry. Combined with not allowing breakpoints on entry/noinstr/NOKPROBE
text and no single step (EFLAGS.TF) inside the #DB handler, this should
guarantee no nested #DB.
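Shrinking the exception-stack set is exactly the kind of change the kernel guards with BUILD_BUG_ON() in every consumer. A userspace analogue of that pattern, using C11 _Static_assert() with an invented enum mirroring the four remaining stacks:

#include <stdio.h>

enum exception_stack {
	ESTACK_DF,
	ESTACK_NMI,
	ESTACK_DB,
	ESTACK_MCE,
	N_EXCEPTION_STACKS
};

static const char *const stack_names[N_EXCEPTION_STACKS] = {
	[ESTACK_DF]  = "#DF",
	[ESTACK_NMI] = "NMI",
	[ESTACK_DB]  = "#DB",
	[ESTACK_MCE] = "#MC",
};

/* Compile-time check, like the kernel's BUILD_BUG_ON(): adding or
 * removing a stack without updating the consumers breaks the build. */
_Static_assert(N_EXCEPTION_STACKS == 4, "update stack_names[]");

int main(void)
{
	for (int i = 0; i < N_EXCEPTION_STACKS; i++)
		printf("%d: %s\n", i, stack_names[i]);
	return 0;
}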
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.303027161@infradead.org --- arch/x86/kernel/asm-offsets_64.c | 3 --- arch/x86/kernel/dumpstack_64.c | 7 ++----- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index c2a47016f243..828be792231e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -57,9 +57,6 @@ int main(void) BLANK(); #undef ENTRY - OFFSET(TSS_ist, tss_struct, x86_tss.ist); - DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - - offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 460ae7f66818..4a94d38cd141 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -22,15 +22,13 @@ static const char * const exception_stack_names[] = { [ ESTACK_DF ] = "#DF", [ ESTACK_NMI ] = "NMI", - [ ESTACK_DB2 ] = "#DB2", - [ ESTACK_DB1 ] = "#DB1", [ ESTACK_DB ] = "#DB", [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -79,7 +77,6 @@ static const struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { EPAGERANGE(DF), EPAGERANGE(NMI), - EPAGERANGE(DB1), EPAGERANGE(DB), EPAGERANGE(MCE), }; @@ -91,7 +88,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) struct pt_regs *regs; unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); /* -- cgit From bf2b3008440072068580c609d79a079656af0588 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2020 23:27:40 +0200 Subject: x86/entry: Rename trace_hardirqs_off_prepare() The typical pattern for trace_hardirqs_off_prepare() is: ENTRY lockdep_hardirqs_off(); // because hardware ... do entry magic instrumentation_begin(); trace_hardirqs_off_prepare(); ... do actual work trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); ... do exit magic lockdep_hardirqs_on(); which shows that it's named wrong, rename it to trace_hardirqs_off_finish(), as it concludes the hardirq_off transition. Also, given that the above is the only correct order, make the traditional all-in-one trace_hardirqs_off() follow suit. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200529213321.415774872@infradead.org --- arch/x86/kernel/cpu/mce/core.c | 2 +- arch/x86/kernel/nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index be499267bbb4..b9cb381b4019 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1922,7 +1922,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) * that out because it's an indirect call. Annotate it. 
*/ instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); machine_check_vector(regs); if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 873a8c040b86..3a98ff36f411 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,7 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6f887be1ac0c..79af913e78a3 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -634,7 +634,7 @@ DEFINE_IDTENTRY_RAW(exc_int3) } else { nmi_enter(); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); if (regs->flags & X86_EFLAGS_IF) @@ -833,7 +833,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, { nmi_enter(); instrumentation_begin(); - trace_hardirqs_off_prepare(); + trace_hardirqs_off_finish(); instrumentation_end(); /* -- cgit From bdf5bde8aec7b53d0ea3a44d880a4e5106ff37f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:16 +0200 Subject: x86/idt: Mark init only functions __init Since 8175cfbbbfcb ("x86/idt: Remove update_intr_gate()") set_intr_gate() and idt_setup_from_table() are only called from __init functions. Mark them as well. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.715816477@linutronix.de --- arch/x86/kernel/idt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 226c99229886..4b99f7bec384 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -197,7 +197,7 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) #endif } -static void +static __init void idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys) { gate_desc desc; @@ -210,7 +210,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy } } -static void set_intr_gate(unsigned int n, const void *addr) +static __init void set_intr_gate(unsigned int n, const void *addr) { struct idt_data data; -- cgit From 94438af40d06c110988fc9e30baf801f38b1491a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 28 May 2020 16:53:17 +0200 Subject: x86/idt: Add comments about early #PF handling The difference between 32 and 64 bit vs. early #PF handling is not documented. Replace the FIXME at idt_setup_early_pf() with proper comments. Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200528145522.807135882@linutronix.de --- arch/x86/kernel/idt.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 4b99f7bec384..5ef82fcf333d 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -61,7 +61,11 @@ static bool idt_setup_done __initdata; static const __initconst struct idt_data early_idts[] = { INTG(X86_TRAP_DB, asm_exc_debug), SYSG(X86_TRAP_BP, asm_exc_int3), + #ifdef CONFIG_X86_32 + /* + * Not possible on 64-bit. See idt_setup_early_pf() for details. 
+	 */
 	INTG(X86_TRAP_PF,		asm_exc_page_fault),
 #endif
 };
@@ -256,8 +260,10 @@ void __init idt_setup_traps(void)
  * cpu_init() is invoked and sets up TSS. The IST variant is installed
  * after that.
  *
- * FIXME: Why is 32bit and 64bit installing the PF handler at different
- * places in the early setup code?
+ * Note that X86_64 cannot install the real #PF handler in
+ * idt_setup_early_traps() because the memory initialization needs the #PF
+ * handler from the early_idt_handler_array to initialize the early page
+ * tables.
  */
 void __init idt_setup_early_pf(void)
 {
-- 
cgit 

From 5a2bafca1b0675a126143eea3610143130347783 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 28 May 2020 16:53:18 +0200
Subject: x86/idt: Use proper constants for table size

Use the actual struct size to calculate the IDT table size instead of
hardcoded values.

Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200528145522.898591501@linutronix.de
---
 arch/x86/kernel/idt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 5ef82fcf333d..b6e1a87f0822 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -51,6 +51,7 @@ struct idt_data {
 #define TSKG(_vector, _gdt)				\
 	G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)

+#define IDT_TABLE_SIZE		(IDT_ENTRIES * sizeof(gate_desc))

 static bool idt_setup_done __initdata;

@@ -168,7 +169,7 @@ static const __initconst struct idt_data early_pf_idts[] = {
 gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;

 struct desc_ptr idt_descr __ro_after_init = {
-	.size		= (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
+	.size		= IDT_TABLE_SIZE - 1,
 	.address	= (unsigned long) idt_table,
 };
-- 
cgit 

From 00229a54300108502f68c8777faca2d13f805f1a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 28 May 2020 16:53:19 +0200
Subject: x86/idt: Cleanup trap_init()

No point in having all the IDT cruft in trap_init(). Move it into the
IDT code and fix up the comments.

Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200528145522.992376498@linutronix.de
---
 arch/x86/kernel/idt.c   | 18 ++++++++++++++++++
 arch/x86/kernel/traps.c |  9 ---------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index b6e1a87f0822..902cdd006313 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -4,6 +4,7 @@
  */
 #include

+#include
 #include
 #include
 #include
@@ -281,6 +282,19 @@ void __init idt_setup_ist_traps(void)
 }
 #endif

+static void __init idt_map_in_cea(void)
+{
+	/*
+	 * Set the IDT descriptor to a fixed read-only location in the cpu
+	 * entry area, so that the "sidt" instruction will not leak the
+	 * location of the kernel, and to defend the IDT against arbitrary
+	 * memory write vulnerabilities.
+	 */
+	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+		    PAGE_KERNEL_RO);
+	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
+}
+
 /**
  * idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
  */
@@ -307,6 +321,10 @@ void __init idt_setup_apic_and_irq_gates(void)
 		set_intr_gate(i, entry);
 	}
 #endif
+	/* Map IDT into CPU entry area and reload it. */
+	idt_map_in_cea();
+	load_idt(&idt_descr);
+
 	idt_setup_done = true;
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 79af913e78a3..5566fe50ef98 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1055,15 +1055,6 @@ void __init trap_init(void)

 	idt_setup_traps();

-	/*
-	 * Set the IDT descriptor to a fixed read-only location, so that the
-	 * "sidt" instruction will not leak the location of the kernel, and
-	 * to defend the IDT against arbitrary memory write vulnerabilities.
-	 * It will be reloaded in cpu_init() */
-	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
-		    PAGE_KERNEL_RO);
-	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
-
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
cgit 

From 3e77abda65b1cec10ef6b18b1ccfee0beaf400f1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 28 May 2020 16:53:20 +0200
Subject: x86/idt: Consolidate idt functionality

 - Move load_current_idt() out of line and replace the hideous comment
   with a lockdep assert. This allows making idt_table and idt_descr
   static.

 - Mark idt_table read only after the IDT initialization is complete.

 - Shuffle code around to consolidate the #ifdef sections into one.

 - Adapt the F00F bug code.

Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200528145523.084915381@linutronix.de
---
 arch/x86/kernel/idt.c | 63 +++++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 25 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 902cdd006313..0db21206f2f3 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -5,6 +5,7 @@
 #include

 #include
+#include
 #include
 #include
 #include
@@ -156,37 +157,25 @@ static const __initconst struct idt_data apic_idts[] = {
 #endif
 };

-#ifdef CONFIG_X86_64
-/*
- * Early traps running on the DEFAULT_STACK because the other interrupt
- * stacks work only after cpu_init().
- */
-static const __initconst struct idt_data early_pf_idts[] = {
-	INTG(X86_TRAP_PF,		asm_exc_page_fault),
-};
-#endif
-
-/* Must be page-aligned because the real IDT is used in a fixmap. */
-gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
+/* Must be page-aligned because the real IDT is used in the cpu entry area */
+static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;

 struct desc_ptr idt_descr __ro_after_init = {
 	.size		= IDT_TABLE_SIZE - 1,
 	.address	= (unsigned long) idt_table,
 };

-#ifdef CONFIG_X86_64
-/*
- * The exceptions which use Interrupt stacks. They are setup after
- * cpu_init() when the TSS has been initialized.
- */
-static const __initconst struct idt_data ist_idts[] = {
-	ISTG(X86_TRAP_DB,	asm_exc_debug,		IST_INDEX_DB),
-	ISTG(X86_TRAP_NMI,	asm_exc_nmi,		IST_INDEX_NMI),
-	ISTG(X86_TRAP_DF,	asm_exc_double_fault,	IST_INDEX_DF),
-#ifdef CONFIG_X86_MCE
-	ISTG(X86_TRAP_MC,	asm_exc_machine_check,	IST_INDEX_MCE),
-#endif
-};
+void load_current_idt(void)
+{
+	lockdep_assert_irqs_disabled();
+	load_idt(&idt_descr);
+}
+
+#ifdef CONFIG_X86_F00F_BUG
+bool idt_is_f00f_address(unsigned long address)
+{
+	return ((address - idt_descr.address) >> 3) == 6;
+}
 #endif

 static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
@@ -255,6 +244,27 @@ void __init idt_setup_traps(void)
 }

 #ifdef CONFIG_X86_64
+/*
+ * Early traps running on the DEFAULT_STACK because the other interrupt
+ * stacks work only after cpu_init().
+ */
+static const __initconst struct idt_data early_pf_idts[] = {
+	INTG(X86_TRAP_PF,		asm_exc_page_fault),
+};
+
+/*
+ * The exceptions which use Interrupt stacks. They are setup after
+ * cpu_init() when the TSS has been initialized.
+ */
+static const __initconst struct idt_data ist_idts[] = {
+	ISTG(X86_TRAP_DB,	asm_exc_debug,		IST_INDEX_DB),
+	ISTG(X86_TRAP_NMI,	asm_exc_nmi,		IST_INDEX_NMI),
+	ISTG(X86_TRAP_DF,	asm_exc_double_fault,	IST_INDEX_DF),
+#ifdef CONFIG_X86_MCE
+	ISTG(X86_TRAP_MC,	asm_exc_machine_check,	IST_INDEX_MCE),
+#endif
+};
+
 /**
  * idt_setup_early_pf - Initialize the idt table with early pagefault handler
  *
@@ -325,6 +335,9 @@ void __init idt_setup_apic_and_irq_gates(void)
 	idt_map_in_cea();
 	load_idt(&idt_descr);

+	/* Make the IDT table read only */
+	set_memory_ro((unsigned long)&idt_table, 1);
+
 	idt_setup_done = true;
 }
-- 
cgit 

From 5ef227933117085f1320b3421ef43a26bf624b4c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 3 Jun 2020 13:40:20 +0200
Subject: x86/entry: Re-order #DB handler to avoid *SAN instrumentation

vmlinux.o: warning: objtool: exc_debug()+0xbb: call to clear_ti_thread_flag.constprop.0() leaves .noinstr.text section
vmlinux.o: warning: objtool: noist_exc_debug()+0x55: call to clear_ti_thread_flag.constprop.0() leaves .noinstr.text section

Rework things so that handle_debug() loses the noinstr and move the
clear_thread_flag() into that.

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200603114052.127756554@infradead.org
---
 arch/x86/kernel/traps.c | 55 ++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5566fe50ef98..7febae381b91 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -775,26 +775,44 @@ static __always_inline void debug_exit(unsigned long dr7)
  *
  * May run on IST stack.
  */
-static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6,
-				 bool user_icebp)
+static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
 {
 	struct task_struct *tsk = current;
+	bool user_icebp;
 	int si_code;

+	/*
+	 * The SDM says "The processor clears the BTF flag when it
+	 * generates a debug exception." Clear TIF_BLOCKSTEP to keep
+	 * TIF_BLOCKSTEP in sync with the hardware BTF flag.
+	 */
+	clear_thread_flag(TIF_BLOCKSTEP);
+
+	/*
+	 * If DR6 is zero, no point in trying to handle it. The kernel is
+	 * not using INT1.
+	 */
+	if (!user && !dr6)
+		return;
+
+	/*
+	 * If dr6 has no reason to give us about the origin of this trap,
+	 * then it's very likely the result of an icebp/int01 trap.
+	 * User wants a sigtrap for that.
+ */ + user_icebp = user && !dr6; + /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - instrumentation_begin(); #ifdef CONFIG_KPROBES if (kprobe_debug_handler(regs)) { - instrumentation_end(); return; } #endif if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0, SIGTRAP) == NOTIFY_STOP) { - instrumentation_end(); return; } @@ -825,7 +843,6 @@ static void noinstr handle_debug(struct pt_regs *regs, unsigned long dr6, out: cond_local_irq_disable(regs); - instrumentation_end(); } static __always_inline void exc_debug_kernel(struct pt_regs *regs, @@ -834,14 +851,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, nmi_enter(); instrumentation_begin(); trace_hardirqs_off_finish(); - instrumentation_end(); - - /* - * The SDM says "The processor clears the BTF flag when it - * generates a debug exception." Clear TIF_BLOCKSTEP to keep - * TIF_BLOCKSTEP in sync with the hardware BTF flag. - */ - clear_thread_flag(TIF_BLOCKSTEP); /* * Catch SYSENTER with TF set and clear DR_STEP. If this hit a @@ -850,14 +859,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs)) dr6 &= ~DR_STEP; - /* - * If DR6 is zero, no point in trying to handle it. The kernel is - * not using INT1. - */ - if (dr6) - handle_debug(regs, dr6, false); + handle_debug(regs, dr6, false); - instrumentation_begin(); if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); @@ -868,14 +871,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, unsigned long dr6) { idtentry_enter_user(regs); - clear_thread_flag(TIF_BLOCKSTEP); + instrumentation_begin(); - /* - * If dr6 has no reason to give us about the origin of this trap, - * then it's very likely the result of an icebp/int01 trap. - * User wants a sigtrap for that. - */ - handle_debug(regs, dr6, !dr6); + handle_debug(regs, dr6, true); + instrumentation_end(); idtentry_exit_user(regs); } -- cgit From f0178fc01fe46bab6a95415f5647d1a74efcad1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 Jun 2020 08:37:01 +0200 Subject: x86/entry: Unbreak __irqentry_text_start/end magic The entry rework moved interrupt entry code from the irqentry to the noinstr section which made the irqentry section empty. This breaks boundary checks which rely on the __irqentry_text_start/end markers to find out whether a function in a stack trace is interrupt/exception entry code. This affects the function graph tracer and filter_irq_stacks(). As the IDT entry points are all sequentialy emitted this is rather simple to unbreak by injecting __irqentry_text_start/end as global labels. 
To make this work correctly: - Remove the IRQENTRY_TEXT section from the x86 linker script - Define __irqentry so it breaks the build if it's used - Adjust the entry mirroring in PTI - Remove the redundant kprobes and unwinder bound checks Reported-by: Qian Cai Signed-off-by: Thomas Gleixner --- arch/x86/kernel/kprobes/core.c | 7 ------- arch/x86/kernel/kprobes/opt.c | 4 +--- arch/x86/kernel/unwind_frame.c | 8 +------- arch/x86/kernel/vmlinux.lds.S | 1 - 4 files changed, 2 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 85de8fa69b24..3bafe1bd4dc7 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1073,13 +1073,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); int __init arch_populate_kprobe_blacklist(void) { - int ret; - - ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, - (unsigned long)__irqentry_text_end); - if (ret) - return ret; - return kprobe_add_area_blacklist((unsigned long)__entry_text_start, (unsigned long)__entry_text_end); } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 234f58e0fe8c..321c19950285 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -286,9 +286,7 @@ static int can_optimize(unsigned long paddr) * stack handling and registers setup. */ if (((paddr >= (unsigned long)__entry_text_start) && - (paddr < (unsigned long)__entry_text_end)) || - ((paddr >= (unsigned long)__irqentry_text_start) && - (paddr < (unsigned long)__irqentry_text_end))) + (paddr < (unsigned long)__entry_text_end))) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 54226110bc7f..722a85f3b2dd 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; - if (addr >= __entry_text_start && addr < __entry_text_end) - return true; - - if (addr >= __irqentry_text_start && addr < __irqentry_text_end) - return true; - - return false; + return addr >= __entry_text_start && addr < __entry_text_end; } static inline unsigned long *last_frame(struct unwind_state *state) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1bf7e312361f..b4c6b6f35548 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -134,7 +134,6 @@ SECTIONS KPROBES_TEXT ALIGN_ENTRY_TEXT_BEGIN ENTRY_TEXT - IRQENTRY_TEXT ALIGN_ENTRY_TEXT_END SOFTIRQENTRY_TEXT *(.fixup) -- cgit From 17fae1294ad9d711b2c3dd0edef479d40c76a5e8 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 20 May 2020 09:35:46 -0700 Subject: x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned An interesting thing happened when a guest Linux instance took a machine check. The VMM unmapped the bad page from guest physical space and passed the machine check to the guest. Linux took all the normal actions to offline the page from the process that was using it. But then guest Linux crashed because it said there was a second machine check inside the kernel with this stack trace: do_memory_failure set_mce_nospec set_memory_uc _set_memory_uc change_page_attr_set_clr cpa_flush clflush_cache_range_opt This was odd, because a CLFLUSH instruction shouldn't raise a machine check (it isn't consuming the data). 
Further investigation showed that the VMM had passed in another machine
check because it appeared that the guest was accessing the bad page.

The fix is to check the scope of the poison by checking the MCi_MISC
register. If the entire page is affected, then unmap the page. If only
part of the page is affected, then mark the page as uncacheable.

This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).

  [ bp: Adjust to x86/entry changes. ]

Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang
Signed-off-by: Tony Luck
Signed-off-by: Borislav Petkov
Signed-off-by: Thomas Gleixner
Tested-by: Jue Wang
Cc:
Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel.com
---
 arch/x86/kernel/cpu/mce/core.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 30413325de22..ce9120c4f740 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
 }
 EXPORT_SYMBOL_GPL(mce_is_memory_error);

+static bool whole_page(struct mce *m)
+{
+	if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+		return true;
+
+	return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
 bool mce_is_correctable(struct mce *m)
 {
 	if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,

 	pfn = mce->addr >> PAGE_SHIFT;
 	if (!memory_failure(pfn, 0)) {
-		set_mce_nospec(pfn);
+		set_mce_nospec(pfn, whole_page(mce));
 		mce->kflags |= MCE_HANDLED_UC;
 	}

@@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb)
 	int flags = MF_ACTION_REQUIRED;

 	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
-	if (!(p->mce_status & MCG_STATUS_RIPV))
+
+	if (!p->mce_ripv)
 		flags |= MF_MUST_KILL;

 	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
-		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
 		return;
 	}

@@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs)
 		BUG_ON(!on_thread_stack() || !user_mode(regs));

 		current->mce_addr = m.addr;
-		current->mce_status = m.mcgstatus;
+		current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
+		current->mce_whole_page = whole_page(&m);
 		current->mce_kill_me.func = kill_me_maybe;
 		if (kill_it)
 			current->mce_kill_me.func = kill_me_now;
-- 
cgit 

From 7ccddc4613db446dc3cbb69a3763ba60ec651d13 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Wed, 27 May 2020 11:28:08 -0700
Subject: x86/mce/dev-mcelog: Fix -Wstringop-truncation warning about strncpy()

The kbuild test robot reported this warning:

  arch/x86/kernel/cpu/mce/dev-mcelog.c: In function 'dev_mcelog_init_device':
  arch/x86/kernel/cpu/mce/dev-mcelog.c:346:2: warning: 'strncpy' output \
    truncated before terminating nul copying 12 bytes from a string of the \
    same length [-Wstringop-truncation]

This is accurate, but I don't care that the trailing NUL character isn't
copied. The string being copied is just a magic number signature so that
crash dump tools can be sure they are decoding the right blob of memory.

Use memcpy() instead of strncpy().
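A short userspace illustration of the point: the signature is a fixed-size magic, not a C string, so the NUL truncation strncpy() warns about is irrelevant and memcpy() says what is meant. The struct layout below is invented for the example; only the 12-byte magic mirrors the kernel's MCE_LOG_SIGNATURE:

#include <stdio.h>
#include <string.h>

#define MCE_LOG_SIGNATURE_MODEL "MACHINECHECK"	/* 12 bytes, no NUL stored */

struct mcelog_hdr {
	char signature[12];	/* magic for crash-dump tools, not a string */
	unsigned int len;
};

int main(void)
{
	struct mcelog_hdr hdr;

	/* Copies exactly 12 bytes, deliberately without a terminator. */
	memcpy(hdr.signature, MCE_LOG_SIGNATURE_MODEL, sizeof(hdr.signature));
	hdr.len = 32;

	/* Print with an explicit length: there is no terminating NUL. */
	printf("signature: %.*s, len=%u\n",
	       (int)sizeof(hdr.signature), hdr.signature, hdr.len);
	return 0;
}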
Fixes: d8ecca4043f2 ("x86/mce/dev-mcelog: Dynamically allocate space for machine check records") Reported-by: kbuild test robot Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200527182808.27737-1-tony.luck@intel.com --- arch/x86/kernel/cpu/mce/dev-mcelog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index a4fd5287f02f..43c466020ed5 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -349,7 +349,7 @@ static __init int dev_mcelog_init_device(void) if (!mcelog) return -ENOMEM; - strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); + memcpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature)); mcelog->len = mce_log_len; mcelog->recordlen = sizeof(struct mce); -- cgit From 15a416e8aaa758b5534f64a3972dae05275bc225 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Jun 2020 20:26:38 -0700 Subject: x86/entry: Treat BUG/WARN as NMI-like entries BUG/WARN are cleverly optimized using UD2 to handle the BUG/WARN out of line in an exception fixup. But if BUG or WARN is issued in a funny RCU context, then the idtentry_enter...() path might helpfully WARN that the RCU context is invalid, which results in infinite recursion. Split the BUG/WARN handling into an nmi_enter()/nmi_exit() path in exc_invalid_op() to increase the chance to survive the experience. [ tglx: Make the declaration match the implementation ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/f8fe40e0088749734b4435b554f73eee53dcf7a8.1591932307.git.luto@kernel.org --- arch/x86/kernel/traps.c | 64 +++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7febae381b91..af75109485c2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -97,24 +97,6 @@ int is_valid_bugaddr(unsigned long addr) return ud == INSN_UD0 || ud == INSN_UD2; } -int fixup_bug(struct pt_regs *regs, int trapnr) -{ - if (trapnr != X86_TRAP_UD) - return 0; - - switch (report_bug(regs->ip, regs)) { - case BUG_TRAP_TYPE_NONE: - case BUG_TRAP_TYPE_BUG: - break; - - case BUG_TRAP_TYPE_WARN: - regs->ip += LEN_UD2; - return 1; - } - - return 0; -} - static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -190,13 +172,6 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - /* - * WARN*()s end up here; fix them up before we call the - * notifier chain. - */ - if (!user_mode(regs) && fixup_bug(regs, trapnr)) - return; - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { cond_local_irq_enable(regs); @@ -241,9 +216,46 @@ static inline void handle_invalid_op(struct pt_regs *regs) ILL_ILLOPN, error_get_trap_addr(regs)); } -DEFINE_IDTENTRY(exc_invalid_op) +DEFINE_IDTENTRY_RAW(exc_invalid_op) { + bool rcu_exit; + + /* + * Handle BUG/WARN like NMIs instead of like normal idtentries: + * if we bugged/warned in a bad RCU context, for example, the last + * thing we want is to BUG/WARN again in the idtentry code, ad + * infinitum. 
+	 */
+	if (!user_mode(regs) && is_valid_bugaddr(regs->ip)) {
+		enum bug_trap_type type;
+
+		nmi_enter();
+		instrumentation_begin();
+		trace_hardirqs_off_finish();
+		type = report_bug(regs->ip, regs);
+		if (regs->flags & X86_EFLAGS_IF)
+			trace_hardirqs_on_prepare();
+		instrumentation_end();
+		nmi_exit();
+
+		if (type == BUG_TRAP_TYPE_WARN) {
+			/* Skip the ud2. */
+			regs->ip += LEN_UD2;
+			return;
+		}
+
+		/*
+		 * Else, if this was a BUG and report_bug returns or if this
+		 * was just a normal #UD, we want to continue onward and
+		 * crash.
+		 */
+	}
+
+	rcu_exit = idtentry_enter_cond_rcu(regs);
+	instrumentation_begin();
 	handle_invalid_op(regs);
+	instrumentation_end();
+	idtentry_exit_cond_rcu(regs, rcu_exit);
 }

 DEFINE_IDTENTRY(exc_coproc_segment_overrun)
-- 
cgit 

From 71ed49d8fb33023f242419a77ecb1141c029cac4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 12 Jun 2020 14:02:27 +0200
Subject: x86/entry: Make NMI use IDTENTRY_RAW

For no reason other than beginning brainmelt, IDTENTRY_NMI was mapped to
IDTENTRY_IST.

This is not a problem on 64bit because the IST default entry point maps to
IDTENTRY_RAW, which does not do any entry handling. The surplus function
declaration for the noist C entry point is unused, and as there is no ASM
code emitted for NMI this went unnoticed.

On 32bit IDTENTRY_IST maps to a regular IDTENTRY which does the normal
entry handling. That is clearly the wrong thing to do for NMI.

Map it to IDTENTRY_RAW to unbreak it. The IDTENTRY_NMI mapping needs to
stay to avoid emitting ASM code.

Fixes: 6271fef00b34 ("x86/entry: Convert NMI to IDTENTRY_NMI")
Reported-by: Naresh Kamboju
Debugged-by: Andy Lutomirski
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/CA+G9fYvF3cyrY+-iw_SZtpN-i2qA2BruHg4M=QYECU2-dNdsMw@mail.gmail.com
---
 arch/x86/kernel/nmi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 3a98ff36f411..2de365f15684 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -476,7 +476,7 @@ static DEFINE_PER_CPU(enum nmi_states, nmi_state);
 static DEFINE_PER_CPU(unsigned long, nmi_cr2);
 static DEFINE_PER_CPU(unsigned long, nmi_dr7);

-DEFINE_IDTENTRY_NMI(exc_nmi)
+DEFINE_IDTENTRY_RAW(exc_nmi)
 {
 	if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
 		return;
-- 
cgit
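The fix above is a one-token macro aliasing question, which a userspace sketch can model without any kernel machinery. Both macro names below are invented stand-ins; the point is only that aliasing the NMI flavor to the RAW flavor means nothing is wrapped around the handler body:

#include <stdio.h>

struct pt_regs { unsigned long ip; };

/* RAW flavor: emits the bare function, no entry/exit work around it. */
#define DEFINE_IDTENTRY_RAW_MODEL(func)		\
	void func(struct pt_regs *regs)

/* The bug mapped the NMI flavor to one that added normal entry handling
 * on 32-bit; the fix aliases it to RAW so the body runs unwrapped: */
#define DEFINE_IDTENTRY_NMI_MODEL DEFINE_IDTENTRY_RAW_MODEL

DEFINE_IDTENTRY_NMI_MODEL(exc_nmi_model)
{
	printf("NMI at ip=%#lx, no normal entry handling around this\n",
	       regs->ip);
}

int main(void)
{
	struct pt_regs regs = { .ip = 0xffffffff81000000UL };

	exc_nmi_model(&regs);
	return 0;
}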