From 1dedefd1a066a795a87afca9c0236e1a94de9bf6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:23 -0700 Subject: x86: detect scattered cpuid features earlier Some extra CPU features such as ARAT are needed in early boot so that x86_init function pointers can be set up properly. http://lkml.org/lkml/2010/5/18/519 At start_kernel() level, this patch moves init_scattered_cpuid_features() from check_bugs() to setup_arch() -> early_cpu_init(), which is earlier than the platform-specific x86_init layer setup. Suggested by HPA. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-2-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1c00d0b1692..284bf89ddae2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } -- cgit From a0c173bd8a3fd0541be8e4ef962170e48d8811c7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:24 -0700 Subject: x86, mrst: add cpu type detection Medfield is the follow-up of Moorestown; it is treated under the same HW sub-architecture. However, we do need to know the CPU type in order for some of the drivers to act accordingly. We also have a different optimal clock configuration for each CPU type. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mrst.h | 19 +++++++++++++++++++ arch/x86/kernel/mrst.c | 26 ++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 451d30e7f62d..dc5c8500bfc7 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,8 +11,27 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +extern int mrst_identify_cpu(void); int __init sfi_parse_mrtc(struct sfi_table_header *table); +/* + * Medfield is the follow-up of Moorestown, it combines two chip solution into + * one. Other than that it also added always-on and constant tsc and lapic + * timers. Medfield is the platform name, and the chip name is called Penwell + * we treat Medfield/Penwell as a variant of Moorestown. Penwell can be + * identified via MSRs. 
+ */ +enum mrst_cpu_type { + MRST_CPU_CHIP_LINCROFT = 1, + MRST_CPU_CHIP_PENWELL, +}; + +enum mrst_timer_options { + MRST_TIMER_DEFAULT, + MRST_TIMER_APBT_ONLY, + MRST_TIMER_LAPIC_APBT, +}; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index e796448f0eb5..ceaebeb5866f 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -27,6 +27,8 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +static int mrst_cpu_chip; + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -216,6 +218,28 @@ static void __init mrst_setup_boot_clock(void) setup_boot_APIC_clock(); }; +int mrst_identify_cpu(void) +{ + return mrst_cpu_chip; +} +EXPORT_SYMBOL_GPL(mrst_identify_cpu); + +void __cpuinit mrst_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -230,6 +254,8 @@ void __init x86_mrst_early_setup(void) x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.oem.arch_setup = mrst_arch_setup; + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; -- cgit From a875c01944f0d750eeb1ef3133feceb13f13c4b3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:25 -0700 Subject: x86, mrst: add more timer config options Always-on local APIC timer (ARAT) has been introduced to Medfield; along with the platform APB timers, we have more timer configuration options between Moorestown and Medfield. This patch adds run-time detection of available timer features so that we can treat Medfield as a variant of Moorestown and set up the optimal timer options for each platform. i.e. Medfield: per cpu always-on local APIC timer Moorestown: per cpu APB timer Manual override is possible via cmdline option x86_mrst_timer. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/apb_timer.h | 1 - arch/x86/include/asm/mrst.h | 1 + arch/x86/kernel/apb_timer.c | 37 ++++------------- arch/x86/kernel/mrst.c | 88 ++++++++++++++++++++++++++++------------ 4 files changed, 72 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index c74a2eebe570..a69b1ac9eaf8 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -55,7 +55,6 @@ extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); extern unsigned int boot_cpu_id; -extern int disable_apbt_percpu; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index dc5c8500bfc7..67ad31545778 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -12,6 +12,7 @@ #define _ASM_X86_MRST_H extern int pci_mrst_init(void); extern int mrst_identify_cpu(void); +extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d36..8dd77800ff5d 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include #include +#include #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -194,29 +193,6 @@ static struct clock_event_device apbt_clockevent = { .rating = APBT_CLOCKEVENT_RATING, }; -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - /* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. 
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index ceaebeb5866f..636b53bd4198 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,6 +25,29 @@ #include #include +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +int mrst_timer_options __cpuinitdata; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; static int mrst_cpu_chip; @@ -169,18 +192,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. 
- */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -197,6 +208,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -207,17 +233,6 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) -{ - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; - int mrst_identify_cpu(void) { return mrst_cpu_chip; @@ -250,13 +265,13 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.oem.arch_setup = mrst_arch_setup; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; @@ -269,3 +284,26 @@ void __init x86_mrst_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; } + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); -- cgit From a75af580bb1fd261bf63cc00e4b324e17ceb15cf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 13:40:14 -0700 Subject: x86, mrst: make mrst_identify_cpu() an inline returning enum We have an enum, might as well use it. While we're at it, make it an inline... there is really no point in calling a function for this stuff. LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 7 ++++++- arch/x86/kernel/mrst.c | 17 ++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 67ad31545778..1869c18d15ca 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_identify_cpu(void); extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); @@ -27,6 +26,12 @@ enum mrst_cpu_type { MRST_CPU_CHIP_PENWELL, }; +extern enum mrst_cpu_type __mrst_cpu_chip; +static enum mrst_cpu_type mrst_identify_cpu(void) +{ + return __mrst_cpu_chip; +} + enum mrst_timer_options { MRST_TIMER_DEFAULT, MRST_TIMER_APBT_ONLY, diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 636b53bd4198..967f2686adb0 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -50,7 +50,8 @@ int mrst_timer_options __cpuinitdata; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -static int mrst_cpu_chip; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); int sfi_mtimer_num; @@ -233,25 +234,19 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -int mrst_identify_cpu(void) -{ - return mrst_cpu_chip; -} -EXPORT_SYMBOL_GPL(mrst_identify_cpu); - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; } pr_debug("Moorestown CPU %s identified\n", - (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? "Lincroft" : "Penwell"); } -- cgit From 14671386dcbafb3086bbda3cb6f9f27d34c7bf6d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 14:37:40 -0700 Subject: x86, mrst: make mrst_timer_options an enum We have an enum mrst_timer_options, use it so that the kernel knows if we're missing something from a switch statement or equivalent. Signed-off-by: H. 
Peter Anvin LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Cc: Thomas Gleixner Cc: Jacob Pan --- arch/x86/include/asm/mrst.h | 3 ++- arch/x86/kernel/mrst.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 1869c18d15ca..16350740edf6 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,6 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); -extern int mrst_timer_options __cpuinitdata; int __init sfi_parse_mrtc(struct sfi_table_header *table); /* @@ -38,6 +37,8 @@ enum mrst_timer_options { MRST_TIMER_LAPIC_APBT, }; +extern enum mrst_timer_options mrst_timer_options; + #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 967f2686adb0..7ee4ed901baf 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -46,7 +46,7 @@ * lapic (always-on,ARAT) ------ 150 */ -int mrst_timer_options __cpuinitdata; +__cpuinitdata enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -- cgit From c0011dbfce69467b23b08fb4a64c39a409a935fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 4 Feb 2010 14:46:34 -0800 Subject: xen: use _PAGE_IOMAP in ioremap to do machine mappings In a Xen domain, ioremap operates on machine addresses, not pseudo-physical addresses. We use _PAGE_IOMAP to determine whether a mapping is intended for machine addresses. [ Impact: allow Xen domain to map real hardware ] Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 8 ++--- arch/x86/xen/mmu.c | 71 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 018a0a400799..bf5f7d32bd08 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -112,13 +112,9 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) */ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { - extern unsigned long max_mapnr; unsigned long pfn = mfn_to_pfn(mfn); - if ((pfn < max_mapnr) - && !xen_feature(XENFEAT_auto_translated_physmap) - && (get_phys_to_machine(pfn) != mfn)) - return max_mapnr; /* force !pfn_valid() */ - /* XXX fixme; not true with sparsemem */ + if (get_phys_to_machine(pfn) != mfn) + return -1; /* force !pfn_valid() */ return pfn; } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..a4dea9df0cc0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,9 +56,11 @@ #include #include +#include #include #include #include +#include #include #include "multicalls.h" @@ -377,6 +379,28 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } +static bool xen_iomap_pte(pte_t pte) +{ + return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); +} + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + struct multicall_space mcs; + struct mmu_update *u; + + mcs = xen_mc_entry(sizeof(*u)); + u = mcs.args; + + /* ptep might be kmapped when using 32-bit HIGHPTE */ + u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->val = pte_val_ma(pteval); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + static void 
xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -453,6 +477,11 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + if (xen_iomap_pte(pteval)) { + xen_set_iomap_pte(ptep, pteval); + goto out; + } + ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -523,8 +552,25 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) return val; } +static pteval_t iomap_pte(pteval_t val) +{ + if (val & _PAGE_PRESENT) { + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & PTE_FLAGS_MASK; + + /* We assume the pte frame number is a MFN, so + just use it as-is. */ + val = ((pteval_t)pfn << PAGE_SHIFT) | flags; + } + + return val; +} + pteval_t xen_pte_val(pte_t pte) { + if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) + return pte.pte; + return pte_mfn_to_pfn(pte.pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -537,7 +583,11 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - pte = pte_pfn_to_mfn(pte); + if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + pte = iomap_pte(pte); + else + pte = pte_pfn_to_mfn(pte); + return native_make_pte(pte); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); @@ -593,6 +643,11 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + ADD_STATS(pte_update, 1); // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); @@ -609,6 +664,11 @@ void xen_set_pte(pte_t *ptep, pte_t pte) #ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { + if (xen_iomap_pte(pte)) { + xen_set_iomap_pte(ptep, pte); + return; + } + set_64bit((u64 *)ptep, native_pte_val(pte)); } @@ -1811,9 +1871,16 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; - default: + case FIX_PARAVIRT_BOOTMAP: + /* This is an MFN, but it isn't an IO mapping from the + IO domain */ pte = mfn_pte(phys, prot); break; + + default: + /* By default, set_fixmap is used for hardware mappings */ + pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP)); + break; } __native_set_fixmap(idx, pte); -- cgit From 7347b4082e55ac4a673f06a0a0ce25c37273c9ec Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 19 Feb 2010 13:31:06 -0500 Subject: xen: Allow unprivileged Xen domains to create iomap pages PV DomU domains are allowed to map hardware MFNs for PCI passthrough, but are not generally allowed to map raw machine pages. In particular, various pieces of code try to map DMI and ACPI tables in the ISA ROM range. We disallow _PAGE_IOMAP for those mappings, so that they are redirected to a set of local zeroed pages we reserve for that purpose. 
[ Impact: prevent passthrough of ISA space, as we only allow PCI ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ++++ arch/x86/xen/mmu.c | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a8..3254e8bc4cd7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1145,6 +1145,10 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; + if (!xen_initial_domain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + __supported_pte_mask |= _PAGE_IOMAP; /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a4dea9df0cc0..a5577f59416a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -381,7 +382,7 @@ static bool xen_page_pinned(void *ptr) static bool xen_iomap_pte(pte_t pte) { - return xen_initial_domain() && (pte_flags(pte) & _PAGE_IOMAP); + return pte_flags(pte) & _PAGE_IOMAP; } static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) @@ -583,10 +584,21 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { - if (unlikely(xen_initial_domain() && (pte & _PAGE_IOMAP))) + phys_addr_t addr = (pte & PTE_PFN_MASK); + + /* + * Unprivileged domains are allowed to do IOMAPpings for + * PCI passthrough, but not map ISA space. The ISA + * mappings are just dummy local mappings to keep other + * parts of the kernel happy. + */ + if (unlikely(pte & _PAGE_IOMAP) && + (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { pte = iomap_pte(pte); - else + } else { + pte &= ~_PAGE_IOMAP; pte = pte_pfn_to_mfn(pte); + } return native_make_pte(pte); } -- cgit From 19001c8c5bfa032ed45b10dfe48e355f5df88c61 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Rename the balloon lock * xen_create_contiguous_region needs access to the balloon lock to ensure memory doesn't change under its feet, so expose the balloon lock * Change the name of the lock to xen_reservation_lock, to imply it's now less-specific usage. [ Impact: cleanup ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a5577f59416a..9e0d82fc21e4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -70,6 +70,13 @@ #define MMU_UPDATE_HISTO 30 +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + #ifdef CONFIG_XEN_DEBUG_FS static struct { -- cgit From 08bbc9da92f7e44b9c208c6a1adba70c403b255e Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Add xen_create_contiguous_region A memory region must be physically contiguous in order to be accessed through DMA. This patch adds xen_create_contiguous_region, which ensures a region of contiguous virtual memory is also physically contiguous. Based on Stephen Tweedie's port of the 2.6.18-xen version. 
Remove contiguous_bitmap[] as it's no longer needed. Ported from linux-2.6.18-xen.hg 707:e410857fd83c [ Impact: add Xen-internal API to make pages phys-contig ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 201 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9e0d82fc21e4..eb51402dd99a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include @@ -2027,6 +2028,206 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +/* Protected by xen_reservation_lock. */ +#define MAX_CONTIG_ORDER 9 /* 2MB */ +static unsigned long discontig_frames[1< MAX_CONTIG_ORDER)) + return -ENOMEM; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Zap current PTEs, remembering MFNs. */ + xen_zap_pfn_range(vstart, order, in_frames, NULL); + + /* 2. Get a new contiguous memory extent. */ + out_frame = virt_to_pfn(vstart); + success = xen_exchange_memory(1UL << order, 0, in_frames, + 1, order, &out_frame, + address_bits); + + /* 3. Map the new extent in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); + else + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + return success ? 0 : -ENOMEM; +} +EXPORT_SYMBOL_GPL(xen_create_contiguous_region); + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +{ + unsigned long *out_frames = discontig_frames, in_frame; + unsigned long flags; + int success; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (unlikely(order > MAX_CONTIG_ORDER)) + return; + + memset((void *) vstart, 0, PAGE_SIZE << order); + + vm_unmap_aliases(); + + spin_lock_irqsave(&xen_reservation_lock, flags); + + /* 1. Find start MFN of contiguous extent. */ + in_frame = virt_to_mfn(vstart); + + /* 2. Zap current PTEs. */ + xen_zap_pfn_range(vstart, order, NULL, out_frames); + + /* 3. Do the exchange for non-contiguous MFNs. */ + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, + 0, out_frames, 0); + + /* 4. Map new pages in place of old pages. */ + if (success) + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); + else + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); + + spin_unlock_irqrestore(&xen_reservation_lock, flags); +} +EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; -- cgit From d6d4d4205cf4ce4ba13bc320305afbda25303496 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Jun 2010 12:07:46 +0200 Subject: x86, xsave: Cleanup return codes in check_for_xstate() The places which call check_for_xstate() only care about zero or non-zero so this patch doesn't change how the code runs, but it's a cleanup. The main reason for this patch is that I'm looking for places which don't return -EFAULT for copy_from_user() failures. Signed-off-by: Dan Carpenter LKML-Reference: <20100603100746.GU5483@bicker> Signed-off-by: H. 
Peter Anvin Cc: Suresh Siddha --- arch/x86/kernel/xsave.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..980149867a19 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } -- cgit From 8cc1176e5de534d55cb26ff0cef3fd0d6ad8c3c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 18:18:40 +0200 Subject: x86, cacheinfo: Carve out L3 cache slot accessors This is in preparation for disabling L3 cache indices after having received correctable ECCs in the L3 cache. Now we allow for initial setting of a disabled index slot (write once) and deny writing new indices to it after it has been disabled. Also, we deny using both slots to disable one and the same index. Userspace can restore the previously disabled indices by rewriting those sysfs entries when booting. Cleanup and reorganize code while at it. Signed-off-by: Borislav Petkov LKML-Reference: <20100602161840.GI18327@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 108 ++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae2062cf5..898c2f4eab88 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) return l3; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) { int node; @@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3 = l3_caches[node]; } +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. 
+ */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int slot) { - struct pci_dev *dev = this_leaf->l3->dev; - unsigned int reg = 0; + int index; if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (!dev) - return -EINVAL; + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); - pci_read_config_dword(dev, 0x1BC + slot * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + return sprintf(buf, "FREE\n"); } #define SHOW_CACHE_DISABLE(slot) \ @@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, } } - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) { - struct pci_dev *dev = this_leaf->l3->dev; - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - unsigned long val = 0; + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - amd_l3_disable_index(this_leaf->l3, cpu, slot, val); - + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } @@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 
amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } -- cgit From 12d8a961289644d265d8b3e88201878837c3b814 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 20:29:21 +0200 Subject: x86, AMD: Extend support to future families Extend support to future families, and in particular: * extend direct mapping split of Tseg SMM area. * extend K8 flavored alternatives (NOPS). * rep movs* prefix is fast in ucode. Signed-off-by: Borislav Petkov LKML-Reference: <20100602182921.GA21557@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..12b9cff047c1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* -- cgit From 1f9a0bd4989fd16842ad71fc89240b48ab191446 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:08 +0800 Subject: x86, mce: Rename MSR_IA32_MCx_CTL2 value Rename CMCI_EN to MCI_CTL2_CMCI_EN and CMCI_THRESHOLD_MASK to MCI_CTL2_CMCI_THRESHOLD_MASK to make naming consistent. Signed-off-by: Huang Ying LKML-Reference: <1275977348.3444.659.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/include/asm/msr-index.h | 3 --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f32a4301c4d4..82db1d8f064b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,10 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +/* CTL2 register defines */ +#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL + #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f6..38f66eb58541 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -94,9 +94,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..faf7b2919a87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,19 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +155,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } -- cgit From 3c417588603e5411f29d22a40f3b5ff71529a4f0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:10 +0800 Subject: x86, mce: Fix MSR_IA32_MCI_CTL2 CMCI threshold setup It is reported that CMCI is not raised when the number of corrected errors reaches the preset threshold. After inspection, it is found that the MSR_IA32_MCI_CTL2 threshold field is not set up properly. This patch fixes it. The value of MCI_CTL2_CMCI_THRESHOLD_MASK is fixed according to the x86_64 Software Developer's Manual too. Reported-by: Shaohui Zheng Signed-off-by: Huang Ying LKML-Reference: <1275977350.3444.660.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 82db1d8f064b..c62c13cb9788 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ /* CTL2 register defines */ #define MCI_CTL2_CMCI_EN (1ULL << 30) -#define MCI_CTL2_CMCI_THRESHOLD_MASK 0xffffULL +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index faf7b2919a87..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -102,6 +102,7 @@ static void cmci_discover(int banks, int boot) continue; } + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); -- cgit From a2d7b0d4852536273b65d16fe179c65184fe5e2d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:35:39 +0800 Subject: x86, mce: Use HW_ERR in MCE handler Use HW_ERR printk prefix in MCE handler. To make it more explicit that this is hardware error instead of software error. Signed-off-by: Huang Ying LKML-Reference: <1275978939.3444.668.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..094b228c8b06 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -211,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", m->cs, m->ip); @@ -224,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -241,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; @@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) apei_err = apei_write_mce(final); } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -1220,7 +1208,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } -- cgit From ec8c27e04f89a7575ca2c4facb99152e03d6a99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:45:36 -0700 Subject: mce: convert to rcu_dereference_index_check() The mce processing applies rcu_dereference_check() to integers used as array indices. This patch therefore moves mce to the new RCU API rcu_dereference_index_check() that avoids the sparse processing that would otherwise result in compiler errors. Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..0e78657e29c5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) -- cgit From 421f91d21ad6f799dc7b489bb33cc560ccc56f98 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 11 Jun 2010 12:17:00 +0200 Subject: fix typos concerning "initiali[zs]e" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/head32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e01618..192cd7ee35cc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -459,7 +459,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e246037392..784360c0625c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; -- cgit From 23016bf0d25d62c45d8b8f61d55b290d704f7a79 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 3 Jun 2010 23:22:28 -0400 Subject: x86: Look for IA32_ENERGY_PERF_BIAS support The new IA32_ENERGY_PERF_BIAS MSR allows system software to give hardware a hint whether OS policy favors more power saving, or more performance. This allows the OS to have some influence on internal hardware power/performance tradeoffs where the OS has previously had no influence. The support for this feature is indicated by CPUID.06H.ECX.bit3, as documented in the Intel Architectures Software Developer's Manual. This patch discovers support of this feature and displays it as "epb" in /proc/cpuinfo. Signed-off-by: Venkatesh Pallipadi LKML-Reference: Signed-off-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 3 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 468145914389..2a904f4071fe 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -162,6 +162,7 @@ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ /* Virtualization flags: Linux defined */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b49d8ca228f6..e57bc20683d6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -238,6 +238,8 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 10fa5684a662..7369b4c2c55a 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, -- cgit From 8b8f79b927b6b302bb65fb8c56e7a19be5fbdbef Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 13 Jun 2010 23:56:54 +0200 Subject: x86, kmmio/mmiotrace: Fix double free of kmmio_fault_pages After every iounmap mmiotrace has to free kmmio_fault_pages, but it can't do it directly, so it defers freeing by RCU. It usually works, but when mmiotraced code calls ioremap-iounmap multiple times without sleeping between (so RCU won't kick in and start freeing) it can be given the same virtual address, so at every iounmap mmiotrace will schedule the same pages for release. Obviously it will explode on second free. Fix it by marking kmmio_fault_pages which are scheduled for release and not adding them second time. Signed-off-by: Marcin Slusarz Tested-by: Marcin Kocielnicki Tested-by: Shinpei KATO Acked-by: Pekka Paalanen Cc: Stuart Bennett Cc: Marcin Kocielnicki Cc: nouveau@lists.freedesktop.org Cc: LKML-Reference: <20100613215654.GA3829@joi.lan> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 16 +++++++++++++--- arch/x86/mm/testmmiotrace.c | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 5d0e67fff1a6..e5d5e2ce9f77 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -45,6 +45,8 @@ struct kmmio_fault_page { * Protected by kmmio_lock, when linked into kmmio_page_table. 
*/ int count; + + bool scheduled_for_release; }; struct kmmio_delayed_release { @@ -398,8 +400,11 @@ static void release_kmmio_fault_page(unsigned long page, BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f); - f->release_next = *release_list; - *release_list = f; + if (!f->scheduled_for_release) { + f->release_next = *release_list; + *release_list = f; + f->scheduled_for_release = true; + } } } @@ -471,8 +476,10 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) prevp = &f->release_next; } else { *prevp = f->release_next; + f->release_next = NULL; + f->scheduled_for_release = false; } - f = f->release_next; + f = *prevp; } spin_unlock_irqrestore(&kmmio_lock, flags); @@ -510,6 +517,9 @@ void unregister_kmmio_probe(struct kmmio_probe *p) kmmio_count--; spin_unlock_irqrestore(&kmmio_lock, flags); + if (!release_list) + return; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { pr_crit("leaking kmmio_fault_page objects.\n"); diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index 8565d944f7cf..38868adf07ea 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -90,6 +90,27 @@ static void do_test(unsigned long size) iounmap(p); } +/* + * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in + * a short time. We had a bug in deferred freeing procedure which tried + * to free this region multiple times (ioremap can reuse the same address + * for many mappings). + */ +static void do_test_bulk_ioremapping(void) +{ + void __iomem *p; + int i; + + for (i = 0; i < 10; ++i) { + p = ioremap_nocache(mmio_address, PAGE_SIZE); + if (p) + iounmap(p); + } + + /* Force freeing. If it will crash we will know why. */ + synchronize_rcu(); +} + static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); @@ -104,6 +125,7 @@ static int __init init(void) "and writing 16 kB of rubbish in there.\n", size >> 10, mmio_address); do_test(size); + do_test_bulk_ioremapping(); pr_info("All done.\n"); return 0; } -- cgit From d7a0380dc3e6607d30ccdfc3cfc2ccee0d966716 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 16 Jun 2010 22:30:42 +0200 Subject: x86-64, mm: Initialize VDSO earlier on 64 bits When initrd is in use and a driver does request_module() in its module_init (i.e. __initcall or device_initcall), a modprobe process is created with VDSO mapping. But VDSO is inited even in __initcall, i.e. on the same level (at the same time), so it may not be inited yet (link order matters). Move the VDSO initialization code earlier by switching to something before rootfs_initcall where initrd is loaded as rootfs. Specifically to subsys_initcall. Do it for standard 64-bit path (init_vdso_vars) and for compat (sysenter_setup), just in case people have 32-bit initrd and ia32 emulation built-in. i386 (pure 32-bit) is not affected, since sysenter_setup() is called from check_bugs()->identify_boot_cpu() in start_kernel() before rest_init()->kernel_thread(kernel_init) where even kernel_init() calls do_basic_setup()->do_initcalls(). What this patch fixes are early modprobe crashes such as: Unpacking initramfs... Freeing initrd memory: 9324k freed modprobe[368]: segfault at 7fff4429c020 ip 00007fef397e160c \ sp 00007fff442795c0 error 4 in ld-2.11.2.so[7fef397df000+1f000] Signed-off-by: Jiri Slaby LKML-Reference: <1276720242-13365-1-git-send-email-jslaby@suse.cz> Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso32-setup.c | 2 +- arch/x86/vdso/vma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 02b442e92007..36df991985b2 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -374,7 +374,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_X86_64 -__initcall(sysenter_setup); +subsys_initcall(sysenter_setup); #ifdef CONFIG_SYSCTL /* Register vsyscall32 into the ABI table */ diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b8140..43456ee17692 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -74,7 +74,7 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; return -ENOMEM; } -__initcall(init_vdso_vars); +subsys_initcall(init_vdso_vars); struct linux_binprm; -- cgit From 05d0b0889ca9d033a960542af7f8a13b3ad4f630 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Jun 2010 14:36:26 -0700 Subject: x86, vdso: Error out if the vdso contains external references The vdso is a piece of userspace code which is supposed to be fully self-contained. Any external (undefined) reference is an error, and should be caught at compile time. This was giving us trouble when compiling with -Os on gcc 4.5.0, for example (failed inline). The need to do a buildtime check was pointed out by Andi Kleen. Reported-by: Andi Kleen LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 3 ++- arch/x86/vdso/checkundef.sh | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100755 arch/x86/vdso/checkundef.sh (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6b4ffedb93c9..4a2afa1bac51 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -120,7 +120,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh new file mode 100755 index 000000000000..490be1c38f94 --- /dev/null +++ b/arch/x86/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi -- cgit From fd699c76552bbfa66631f019be415a87dbb08237 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Fri, 18 Jun 2010 17:46:53 -0400 Subject: x86, olpc: Add support for calling into OpenFirmware Add support for saving OFW's cif, and later calling into it to run OFW commands. OFW remains resident in memory, living within virtual range 0xff800000 - 0xffc00000. A single page directory entry points to the pgdir that OFW actually uses, so rather than saving the entire page table, we grab and install that one entry permanently in the kernel's page table. This is currently only used by the OLPC XO. Note that this particular calling convention breaks PAE and PAT, and so cannot be used on newer x86 hardware. Signed-off-by: Andres Salomon LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 9 ++++ arch/x86/include/asm/bootparam.h | 11 ++++- arch/x86/include/asm/olpc_ofw.h | 31 ++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head_32.S | 6 +++ arch/x86/kernel/olpc.c | 12 ++--- arch/x86/kernel/olpc_ofw.c | 104 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 6 +++ 8 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 arch/x86/include/asm/olpc_ofw.h create mode 100644 arch/x86/kernel/olpc_ofw.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a66..71c194db2e0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2062,6 +2062,15 @@ config OLPC Add support for detecting the unique features of the OLPC XO hardware. +config OLPC_OPENFIRMWARE + bool "Support for OLPC's Open Firmware" + depends on !X86_64 && !X86_PAE + default y if OLPC + help + This option adds support for the implementation of Open Firmware + that is used on the OLPC XO-1 Children's Machine. + If unsure, say N here. + endif # X86_32 config K8_NB diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 6be33d83c716..8e6218550e77 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -70,6 +70,14 @@ struct sys_desc_table { __u8 table[14]; }; +/* Gleaned from OFW's set-parameters in cpu/x86/pc/linux.fth */ +struct olpc_ofw_header { + __u32 ofw_magic; /* OFW signature */ + __u32 ofw_version; + __u32 cif_handler; /* callback into OFW */ + __u32 irq_desc_table; +} __attribute__((packed)); + struct efi_info { __u32 efi_loader_signature; __u32 efi_systab; @@ -92,7 +100,8 @@ struct boot_params { __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ - __u8 _pad4[144]; /* 0x0b0 */ + struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */ + __u8 _pad4[128]; /* 0x0c0 */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h new file mode 100644 index 000000000000..3e63d857c48f --- /dev/null +++ b/arch/x86/include/asm/olpc_ofw.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_OLPC_OFW_H +#define _ASM_X86_OLPC_OFW_H + +/* index into the page table containing the entry OFW occupies */ +#define OLPC_OFW_PDE_NR 1022 + +#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ + +#ifdef CONFIG_OLPC_OPENFIRMWARE + +/* run an OFW command by calling into the firmware */ +#define olpc_ofw(name, args, res) \ + __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) + +extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res); + +/* determine whether OFW is available and lives in the proper memory */ +extern void olpc_ofw_detect(void); + +/* install OFW's pde permanently into the kernel's pgtable */ +extern void setup_olpc_ofw_pgd(void); + +#else /* !CONFIG_OLPC_OPENFIRMWARE */ + +static inline void olpc_ofw_detect(void) { } +static inline void setup_olpc_ofw_pgd(void) { } + +#endif /* !CONFIG_OLPC_OPENFIRMWARE */ + +#endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b22083721..0925676266bd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := 
microcode_core.o diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..ff4c453e13f3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..156605281f56 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include #include #include - -#ifdef CONFIG_OPEN_FIRMWARE -#include -#endif +#include struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -188,14 +185,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..469ee4384295 --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int 
*))hdr->cif_handler; + + if ((unsigned long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd031..b008e7883207 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include #include +#include #include #include @@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; -- cgit From 75a9cac430a1bd2a5219c74508ca01b0ddfddc9a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 23 Jun 2010 20:27:00 -0400 Subject: x86, olpc: Add comment about implicit optimization barrier Signed-off-by: Andres Salomon Cc: H. Peter Anvin LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc_ofw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 469ee4384295..f5d499fbe74f 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -35,6 +35,8 @@ void __init setup_olpc_ofw_pgd(void) /* install OFW's PDE permanently into the kernel's pgtable */ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -- cgit From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: increase max_active of keventd and kill current_is_keventd() Define WQ_MAX_ACTIVE and create keventd with max_active set to half of it which means that keventd now can process upto WQ_MAX_ACTIVE / 2 - 1 works concurrently. Unless some combination can result in dependency loop longer than max_active, deadlock won't happen and thus it's unnecessary to check whether current_is_keventd() before trying to schedule a work. Kill current_is_keventd(). (Lockdep annotations are broken. 
We need lock_map_acquire_read_norecurse()) Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Tony Luck Cc: Andi Kleen Cc: Oleg Nesterov --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..4d90f376e985 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); -- cgit From ea812ca1b06113597adcd8e70c0f84a413d97544 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 29 Jun 2010 18:38:00 +0000 Subject: x86: Align skb w/ start of cacheline on newer core 2/Xeon Arch x86 architectures can handle unaligned accesses in hardware, and it has been shown that unaligned DMA accesses can be expensive on Nehalem architectures. As such we should overwrite NET_IP_ALIGN to resolve this issue. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b8fe48ee2ed9..b4293fc8b798 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,4 +457,13 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } +#ifdef CONFIG_MCORE2 +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 +#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit From 7475271004b66e9c22e1bb28f240a38c5d6fe76e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 Jul 2010 13:28:27 +0000 Subject: x86: Drop CONFIG_MCORE2 check around setting of NET_IP_ALIGN This patch removes the CONFIG_MCORE2 check from around NET_IP_ALIGN. It is based on a suggestion from Andi Kleen. The assumption is that there are not any x86 cores where unaligned access is really slow, and this change would allow for a performance improvement to still exist on configurations that are not necessarily optimized for Core 2. Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Acked-by: H. Peter Anvin Signed-off-by: David S. Miller --- arch/x86/include/asm/system.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index b4293fc8b798..1db9bd2281d7 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -457,7 +457,6 @@ static inline void rdtsc_barrier(void) alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); } -#ifdef CONFIG_MCORE2 /* * We handle most unaligned accesses in hardware. On the other hand * unaligned DMA can be quite expensive on some Nehalem processors. @@ -465,5 +464,4 @@ static inline void rdtsc_barrier(void) * Based on this we disable the IP header alignment in network drivers. 
*/ #define NET_IP_ALIGN 0 -#endif #endif /* _ASM_X86_SYSTEM_H */ -- cgit From 8e221b6db4477643fefc885a97ea9889ac733140 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 22 Jun 2010 16:23:37 -0700 Subject: x86: Avoid unnecessary __clear_user() and xrstor in signal handling fxsave/xsave doesn't touch all the bytes in the memory layout used by these instructions. Specifically SW reserved (bytes 464..511) fields in the fxsave frame and the reserved fields in the xsave header. To present a clean context for the signal handling, just clear these fields instead of clearing the complete fxsave/xsave memory layout, when we dump these registers directly to the user signal frame. Also avoid the call to second xrstor (which inits the state not passed in the signal frame) in restore_user_xstate() if all the state has already been restored by the first xrstor. These changes improve the performance of signal handling(by ~3-5% as measured by the lat_sig). Signed-off-by: Suresh Siddha LKML-Reference: <1277249017.2847.85.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 9 +++++++++ arch/x86/include/asm/xsave.h | 10 ++++++++++ arch/x86/kernel/xsave.c | 12 ++---------- 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b904..0f1cf5d53dd8 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -127,6 +127,15 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; + /* + * Clear the bytes not touched by the fxsave and reserved + * for the SW usage. + */ + err = __clear_user(&fx->sw_reserved, + sizeof(struct _fpx_sw_bytes)); + if (unlikely(err)) + return -EFAULT; + asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae228..30dfc81804d5 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -59,6 +59,16 @@ static inline int fpu_xrstor_checking(struct fpu *fpu) static inline int xsave_user(struct xsave_struct __user *buf) { int err; + + /* + * Clear the xsave header first, so that reserved fields are + * initialized to zero. + */ + err = __clear_user(&buf->xsave_hdr, + sizeof(struct xsave_hdr_struct)); + if (unlikely(err)) + return -EFAULT; + __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" "2:\n" ".section .fixup,\"ax\"\n" diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..6e73db1b7b4e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -91,14 +91,6 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - if (use_xsave()) err = xsave_user(buf); else @@ -184,8 +176,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; -- cgit From 24da9c26f3050aee9314ec09930a24c80fe76352 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 7 Jul 2010 10:15:12 -0700 Subject: x86, cpu: Add CPU flags for F16C and RDRND Add support for the newly documented F16C (16-bit floating point conversions) and RDRND (RDRAND instruction) CPU feature flags. Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 2a904f4071fe..aeb6f3f9b2ca 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -124,6 +124,8 @@ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -- cgit From 83a7a2ad2a9173dcabc05df0f01d1d85b7ba1c2c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 10 Jun 2010 00:10:43 +0000 Subject: x86, alternatives: Use 16-bit numbers for cpufeature index We already have cpufeature indicies above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 7 ++++--- arch/x86/include/asm/cpufeature.h | 14 ++++++++------ arch/x86/kernel/entry_32.S | 2 +- arch/x86/lib/clear_page_64.S | 2 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/memcpy_64.S | 2 +- arch/x86/lib/memset_64.S | 2 +- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 03b6bb5394a0..bc6abb7bc7ee 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,10 +45,9 @@ struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; - u8 cpuid; /* cpuid bit set for replacement */ + u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ - u8 pad1; #ifdef CONFIG_X86_64 u32 pad2; #endif @@ -86,9 +85,11 @@ static inline int alternatives_text_reserved(void *start, void *end) _ASM_ALIGN "\n" \ _ASM_PTR "661b\n" /* label */ \ _ASM_PTR "663f\n" /* new instruction */ \ - " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 468145914389..e8b88967de35 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -291,7 +291,7 @@ extern const char * const x86_power_flags[32]; * patch the target code for additional performance. 
* */ -static __always_inline __pure bool __static_cpu_has(u8 bit) +static __always_inline __pure bool __static_cpu_has(u16 bit) { #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) asm goto("1: jmp %l[t_no]\n" @@ -300,11 +300,11 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "0\n" /* no replacement */ - " .byte %P0\n" /* feature bit */ + " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ - " .byte 0xff + 0 - (2b-1b)\n" /* padding */ ".previous\n" + /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); return true; t_no: @@ -318,10 +318,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) _ASM_ALIGN "\n" _ASM_PTR "1b\n" _ASM_PTR "3f\n" - " .byte %P1\n" /* feature bit */ + " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* padding */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" "3: movb $1,%0\n" @@ -337,7 +339,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - (__builtin_constant_p(bit) && !((bit) & ~0xff)) ? \ + __builtin_constant_p(bit) ? \ __static_cpu_has(bit) : \ boot_cpu_has(bit) \ ) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..7862cf510ea9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -914,7 +914,7 @@ ENTRY(simd_coprocessor_error) .balign 4 .long 661b .long 663f - .byte X86_FEATURE_XMM + .word X86_FEATURE_XMM .byte 662b-661b .byte 664f-663f .previous diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index ebeafcce04a9..aa4326bfb24a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -52,7 +52,7 @@ ENDPROC(clear_page) .align 8 .quad clear_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lclear_page_end - clear_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 727a5d46d2fc..6fec2d1cebe1 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -113,7 +113,7 @@ ENDPROC(copy_page) .align 8 .quad copy_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lcopy_page_end - copy_page .byte 2b - 1b .previous diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index f82e884928af..bcbcd1e0f7d5 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -131,7 +131,7 @@ ENDPROC(__memcpy) .align 8 .quad memcpy .quad .Lmemcpy_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD /* * Replace only beginning, memcpy is used to apply alternatives, diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index e88d3b81644a..09d344269652 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -121,7 +121,7 @@ ENDPROC(__memset) .align 8 .quad memset .quad .Lmemset_c - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lfinal - memset .byte .Lmemset_e - .Lmemset_c .previous -- cgit From bdc802dcca1709b01988d57e91f9f35ce1609fcc Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 7 Jul 2010 17:29:18 -0700 Subject: x86, cpu: Support the features flags in new CPUID leaf 7 Intel has defined CPUID leaf 7 as the next set of feature flags (see the AVX specification, version 007). Add support for this new feature flags word. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/required-features.h | 2 ++ arch/x86/kernel/cpu/common.c | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index aeb6f3f9b2ca..3ec9275cea46 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -6,7 +6,7 @@ #include -#define NCAPINTS 9 /* N 32-bit words worth of info */ +#define NCAPINTS 10 /* N 32-bit words worth of info */ /* * Note: If the comment begins with a quoted string, that string is used @@ -159,14 +159,14 @@ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc + * CPUID levels like 0x6, 0xA etc, word 7 */ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -/* Virtualization flags: Linux defined */ +/* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ @@ -177,6 +177,9 @@ #define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ + #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include @@ -197,7 +200,9 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ ? 
1 : \ test_cpu_cap(c, bit)) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 64cf2d24fad1..6c7fc25f2c34 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -84,5 +84,7 @@ #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 #define REQUIRED_MASK7 0 +#define REQUIRED_MASK8 0 +#define REQUIRED_MASK9 0 #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211e..c7358303d8cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; -- cgit From 5bbd4a336c81d32df71642abf310cf3d0c98dc9b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 7 Jul 2010 19:51:59 -0400 Subject: x86/apic/es7000_32: Remove unused variable In today's linux-next I got this compile warning: arch/x86/kernel/apic/es7000_32.c:132: warning: 'base' defined but not used Current patch solves the issue removing the unused variable. Signed-off-by: Javier Martinez Canillas Cc: Rakib Mullick Cc: Eric W. Biederman Cc: Cyrill Gorcunov Cc: Tejun Heo LKML-Reference: <1278546719.9020.4.camel@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a87feb..8593582d8022 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,7 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { -- cgit From 9279aa55061a280b826bdf9ba5ab5f6a566c1dfb Mon Sep 17 00:00:00 2001 From: Ky Srinivasan Date: Mon, 28 Jun 2010 08:48:55 -0600 Subject: x86: Export the symbol ms_hyperv This is needed so that the staging hyperv can properly access this symbol. Signed-off-by: K. Y. Srinivasan Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mshyperv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b6..d944bf6c50e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); static bool __init ms_hyperv_platform(void) { -- cgit From ffa71f33a820d1ab3f2fc5723819ac60fb76080b Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:22:40 +0900 Subject: x86, ioremap: Fix incorrect physical address handling in PAE mode Current x86 ioremap() doesn't handle physical address higher than 32-bit properly in X86_32 PAE mode. When physical address higher than 32-bit is passed to ioremap(), higher 32-bits in physical address is cleared wrongly. Due to this bug, ioremap() can map wrong address to linear address space. In my case, 64-bit MMIO region was assigned to a PCI device (ioat device) on my system. 
Because of the ioremap()'s bug, wrong physical address (instead of MMIO region) was mapped to linear address space. Because of this, loading ioatdma driver caused unexpected behavior (kernel panic, kernel hangup, ...). Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE680.7090408@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 12e4d2d3c110..754cb4cbce66 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -62,8 +62,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, vaddr; - resource_size_t last_addr; + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; @@ -100,10 +100,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; - (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); - pfn++) { - + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) @@ -115,7 +113,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, -- cgit From 35be1b716a475717611b2dc04185e9d80b9cb693 Mon Sep 17 00:00:00 2001 From: Kenji Kaneshige Date: Fri, 18 Jun 2010 12:23:57 +0900 Subject: x86, ioremap: Fix normal ram range check Check for normal RAM in x86 ioremap() code seems to not work for the last page frame in the specified physical address range. Signed-off-by: Kenji Kaneshige LKML-Reference: <4C1AE6CD.1080704@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 754cb4cbce66..d41d3a9036ca 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -101,7 +101,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Don't allow anybody to remap normal RAM that we're using.. */ last_pfn = last_addr >> PAGE_SHIFT; - for (pfn = phys_addr >> PAGE_SHIFT; pfn < last_pfn; pfn++) { + for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) -- cgit From fa97bdf92709adaaf8b9a5164a895e262a4fcf60 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 11 Jul 2010 11:06:57 +0300 Subject: x86, setup: Early-boot serial I/O support This patch adds serial I/O support to the real-mode setup (very early boot) printf(). It's useful for debugging boot code when running Linux under KVM, for example. The actual code was lifted from early printk. Cc: Cyrill Gorcunov Cc: Ingo Molnar Cc: Yinghai Lu Signed-off-by: Pekka Enberg LKML-Reference: <1278835617-11368-1-git-send-email-penberg@cs.helsinki.fi> Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/boot.h | 16 +++++++ arch/x86/boot/main.c | 3 ++ arch/x86/boot/string.c | 41 ++++++++++++++++++ arch/x86/boot/tty.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 165 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 98239d2658f2..46c4c5c71af7 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -37,6 +37,8 @@ extern struct setup_header hdr; extern struct boot_params boot_params; +#define cpu_relax() asm volatile("rep; nop") + /* Basic port I/O */ static inline void outb(u8 v, u16 port) { @@ -203,6 +205,17 @@ static inline int isdigit(int ch) return (ch >= '0') && (ch <= '9'); } +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + /* Heap -- available for dynamic lists. */ extern char _end[]; extern char *HEAP; @@ -329,10 +342,13 @@ void initregs(struct biosregs *regs); /* string.c */ int strcmp(const char *str1, const char *str2); +int strncmp(const char *cs, const char *ct, size_t count); size_t strnlen(const char *s, size_t maxlen); unsigned int atou(const char *s); +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ +void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 140172b895bd..4ef1a33e8572 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -130,6 +130,9 @@ void main(void) /* First, copy the boot header into the "zeropage" */ copy_boot_params(); + /* Initialize the early-boot console */ + console_init(); + /* End of heap check */ init_heap(); diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index f94b7a0c2abf..aba29df4a7bb 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -30,6 +30,22 @@ int strcmp(const char *str1, const char *str2) return 0; } +int strncmp(const char *cs, const char *ct, size_t count) +{ + unsigned char c1, c2; + + while (count) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + count--; + } + return 0; +} + size_t strnlen(const char *s, size_t maxlen) { const char *es = s; @@ -48,3 +64,28 @@ unsigned int atou(const char *s) i = i * 10 + (*s++ - '0'); return i; } + +/* Works only for digits and letters, but small and fast */ +#define TOLOWER(x) ((x) | 0x20) + +unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) +{ + unsigned long long result = 0; + + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') + cp += 2; + + while (isxdigit(*cp)) { + unsigned int value; + + value = isdigit(*cp) ? *cp - '0' : TOLOWER(*cp) - 'a' + 10; + if (value >= base) + break; + result = result * base + value; + cp++; + } + if (endp) + *endp = (char *)cp; + + return result; +} diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 01ec69c901c7..f3ceee20ff12 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -10,23 +10,51 @@ * ----------------------------------------------------------------------- */ /* - * Very simple screen I/O - * XXX: Probably should add very simple serial I/O? 
+ * Very simple screen and serial I/O */ #include "boot.h" +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +static int early_serial_base; + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + /* * These functions are in .inittext so they can be used to signal * error during initialization. */ -void __attribute__((section(".inittext"))) putchar(int ch) +static void __attribute__((section(".inittext"))) serial_putchar(int ch) { - struct biosregs ireg; + unsigned timeout = 0xffff; - if (ch == '\n') - putchar('\r'); /* \n -> \r\n */ + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +static void __attribute__((section(".inittext"))) bios_putchar(int ch) +{ + struct biosregs ireg; initregs(&ireg); ireg.bx = 0x0007; @@ -36,6 +64,17 @@ void __attribute__((section(".inittext"))) putchar(int ch) intcall(0x10, &ireg, NULL); } +void __attribute__((section(".inittext"))) putchar(int ch) +{ + if (ch == '\n') + putchar('\r'); /* \n -> \r\n */ + + bios_putchar(ch); + + if (early_serial_base != 0) + serial_putchar(ch); +} + void __attribute__((section(".inittext"))) puts(const char *str) { while (*str) @@ -112,3 +151,63 @@ int getchar_timeout(void) return 0; /* Timeout! */ } + +static void early_serial_init(int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +void console_init(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + early_serial_base = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int port = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + port = 1; + + early_serial_base = bases[port]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (early_serial_base != 0) + early_serial_init(baud); +} -- cgit From ce0aa5dd20e44372f9617dd67c984f41fcdbed88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 13 Jul 2010 13:35:17 -0700 Subject: x86, setup: Make the setup code also accept console=uart8250 Make the boot code also accept the console=uart8250,io,0x2f8,115200n form of early console. Also add back simple_guess_base(), otherwise those simple_strtoull(,,0) are not going to work. Signed-off-by: Yinghai Lu LKML-Reference: <4C3CCE05.4090505@kernel.org> Acked-by: Pekka Enberg Signed-off-by: H. 
Peter Anvin --- arch/x86/boot/string.c | 22 +++++++++++++++++++ arch/x86/boot/tty.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index aba29df4a7bb..3cbc4058dd26 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -68,10 +68,32 @@ unsigned int atou(const char *s) /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) +static unsigned int simple_guess_base(const char *cp) +{ + if (cp[0] == '0') { + if (TOLOWER(cp[1]) == 'x' && isxdigit(cp[2])) + return 16; + else + return 8; + } else { + return 10; + } +} + +/** + * simple_strtoull - convert a string to an unsigned long long + * @cp: The start of the string + * @endp: A pointer to the end of the parsed string will be placed here + * @base: The number base to use + */ + unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { unsigned long long result = 0; + if (!base) + base = simple_guess_base(cp); + if (base == 16 && cp[0] == '0' && TOLOWER(cp[1]) == 'x') cp += 2; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f3ceee20ff12..f6d52e65f97a 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -170,7 +170,7 @@ static void early_serial_init(int baud) outb(c & ~DLAB, early_serial_base + LCR); } -void console_init(void) +static int parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; @@ -208,6 +208,63 @@ void console_init(void) baud = DEFAULT_BAUD; } + return baud; +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static int parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return baud; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + early_serial_base = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + early_serial_base = simple_strtoull(options + 8, &options, 0); + else + return baud; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(early_serial_base); + + return baud; +} + +void console_init(void) +{ + int baud; + + baud = parse_earlyprintk(); + + if (!early_serial_base) + baud = parse_console_uart8250(); + if (early_serial_base != 0) early_serial_init(baud); } -- cgit From df378ccfc4dd04e263426ad805516915874774aa Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:55:11 -0700 Subject: x86, alternatives: Fix one more open-coded 8-bit alternative number Fix a missing case of an 8-bit alternative number, buried inside an assembly macro. Signed-off-by: H. 
Peter Anvin Reported-by: Yinghai Lu Cc: Suresh Siddha LKML-Reference: <4C3BDDA3.2060900@kernel.org> --- arch/x86/lib/copy_user_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 71100c98e337..a460158b5ac5 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -29,7 +29,7 @@ .align 8 .quad 0b .quad 2b - .byte \feature /* when feature is set */ + .word \feature /* when feature is set */ .byte 5 .byte 5 .previous -- cgit From 3b770a2128423a687e6e9c57184a584fb4ba4c77 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:57:50 -0700 Subject: x86, alternatives: BUG on encountering an invalid CPU feature number Make the alternatives-patching code BUG on encountering an invalid CPU feature number. Should have done this a long time ago. Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Suresh Siddha LKML-Reference: --- arch/x86/kernel/alternative.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 70237732a6c7..f65ab8b014c4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 -- cgit From 70b0d22d581a5deef7b2876b0c3774635b8d846c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 14 Jul 2010 11:26:57 -0700 Subject: x86, setup: Only set early_serial_base after port is initialized putchar is using early_serial_base to check if port is initialized. So we only assign it after early_serial_init() is called, in case we need use VGA to debug early serial console. Also add display for port addr and baud. -v2: update to current tip Acked-by: Pekka Enberg Signed-off-by: Yinghai Lu LKML-Reference: <4C3E0171.6050008@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/tty.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index f6d52e65f97a..ff4b27a0fc5e 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -152,35 +152,40 @@ int getchar_timeout(void) return 0; /* Timeout! 
*/ } -static void early_serial_init(int baud) +static void early_serial_init(int port, int baud) { unsigned char c; unsigned divisor; - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = port; + + printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); } -static int parse_earlyprintk(void) +static void parse_earlyprintk(void) { int baud = DEFAULT_BAUD; char arg[32]; int pos = 0; + int port = 0; if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { char *e; if (!strncmp(arg, "serial", 6)) { - early_serial_base = DEFAULT_SERIAL_PORT; + port = DEFAULT_SERIAL_PORT; pos += 6; } @@ -189,15 +194,15 @@ static int parse_earlyprintk(void) if (!strncmp(arg, "ttyS", 4)) { static const int bases[] = { 0x3f8, 0x2f8 }; - int port = 0; + int idx = 0; if (!strncmp(arg + pos, "ttyS", 4)) pos += 4; if (arg[pos++] == '1') - port = 1; + idx = 1; - early_serial_base = bases[port]; + port = bases[idx]; } if (arg[pos] == ',') @@ -208,7 +213,8 @@ static int parse_earlyprintk(void) baud = DEFAULT_BAUD; } - return baud; + if (port) + early_serial_init(port, baud); } #define BASE_BAUD (1843200/16) @@ -227,44 +233,41 @@ static unsigned int probe_baud(int port) return BASE_BAUD / quot; } -static int parse_console_uart8250(void) +static void parse_console_uart8250(void) { char optstr[64], *options; int baud = DEFAULT_BAUD; + int port = 0; /* * console=uart8250,io,0x3f8,115200n8 * need to make sure it is last one console ! */ if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return baud; + return; options = optstr; if (!strncmp(options, "uart8250,io,", 12)) - early_serial_base = simple_strtoull(options + 12, &options, 0); + port = simple_strtoull(options + 12, &options, 0); else if (!strncmp(options, "uart,io,", 8)) - early_serial_base = simple_strtoull(options + 8, &options, 0); + port = simple_strtoull(options + 8, &options, 0); else - return baud; + return; if (options && (options[0] == ',')) baud = simple_strtoull(options + 1, &options, 0); else - baud = probe_baud(early_serial_base); + baud = probe_baud(port); - return baud; + if (port) + early_serial_init(port, baud); } void console_init(void) { - int baud; - - baud = parse_earlyprintk(); + parse_earlyprintk(); if (!early_serial_base) - baud = parse_console_uart8250(); - - if (early_serial_base != 0) - early_serial_init(baud); + parse_console_uart8250(); } -- cgit From b2691085d1f3ccce641dcfdd02722ba5d34db6ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Jun 2010 16:46:48 -0700 Subject: x86: Clean up arch/x86/kernel/cpu/mtrr/cleanup.c: use ";" not "," to terminate statements Also needed if pr_ becomes a bit more space efficient. Signed-off-by: Joe Perches Acked-by: Thomas Gleixner Cc: "H. 
Peter Anvin" LKML-Reference: <1277768808.29157.280.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", -- cgit From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 May 2010 20:17:00 +0200 Subject: unistd: add __NR_prlimit64 syscall numbers Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to asm-x86, both 32 and 64-bit. Signed-off-by: Jiri Slaby Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa3..a88e31d1836e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_prlimit64 ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a4..35e0cb151b69 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_prlimit64 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81e..570bf5eae564 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_prlimit64 300 +__SYSCALL(__NR_prlimit64, sys_prlimit64) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..eca1d7d23ab5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_prlimit64 -- cgit From a2531293dbb7608fa672ff28efe3ab4027917a2f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 18 Jul 2010 14:27:13 +0200 Subject: update email address 
pavel@suse.cz no longer works, replace it with working address. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/power/cpu.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f51cc55aced6 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek + * Copyright (C) 2001-2003 Pavel Machek */ #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182ca3ac..4c9c67bf09b7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek + * Make kapmd absorb more idle time (Pavel Machek * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * fixed by sfr). diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e6..0af9aa20fce1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -9,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek + * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee41bba315d1..9a6674689a20 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -2,7 +2,7 @@ * linux/arch/x86_64/mm/init.c * * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Pavel Machek * Copyright (C) 2002,2003 Andi Kleen */ diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1290ba54b350..e7e8c5f54956 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index d24f983ba1e5..460f314d13e5 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -4,7 +4,7 @@ * Distribute under GPLv2 * * Copyright (c) 2007 Rafael J. Wysocki - * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2002 Pavel Machek * Copyright (c) 2001 Patrick Mochel */ -- cgit From 6c54aabd5e687092557f4881ce2d4013b971f293 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 12:03:51 -0400 Subject: x86/amd-iommu: Use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. 
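[Editor's illustration, not part of the patch: for_each_pci_dev() is a thin wrapper around pci_get_device(). The sketch below mirrors the helper's definition in <linux/pci.h> as I understand it; the function name walk_all_pci_devices() and the loop body are made up for the example and are not this driver's code.]

#include <linux/pci.h>

/*
 * for_each_pci_dev(d) is (approximately) defined in <linux/pci.h> as:
 *
 *   #define for_each_pci_dev(d) \
 *           while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
 *
 * pci_get_device() drops the reference held on the device passed in and
 * takes a reference on the device it returns, so starting from NULL and
 * letting the loop run to completion leaves refcounts balanced.
 */
static void walk_all_pci_devices(void)
{
	struct pci_dev *dev = NULL;

	for_each_pci_dev(dev) {
		/* per-device work, e.g. the check_device() call in this patch */
	}
}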
Signed-off-by: Kulikov Vasiliy Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c6..29dd3b9f2f09 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2609,8 +2609,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; -- cgit From edb18f8ab02843453306601c4aa697f9691129cd Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:50 -0700 Subject: x86, cpu: Make init_scattered_cpuid_features() consider cpuid subleaves Some cpuid features (like xsaveopt) are enumerated using cpuid subleaves. Extend init_scattered_cpuid_features() to take subleaf into account. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.439900717@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 7369b4c2c55a..03cf24a3d933 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -14,6 +14,7 @@ struct cpuid_bit { u8 reg; u8 bit; u32 level; + u32 sub_leaf; }; enum cpuid_regs { @@ -30,16 +31,16 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } }; for (cb = cpuid_bits; cb->feature; cb++) { @@ -50,8 +51,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], - ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); -- cgit From 40e1d7a4ffee5cb17f5c36f4c3c4a011ab103ebe Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:51 -0700 Subject: x86, cpu: Add xsaveopt cpufeature Add cpu feature bit support for the XSAVEOPT instruction. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.523204988@sbs-t61.sc.intel.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3ec9275cea46..d5ea3e3a8a42 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,6 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ -- cgit From 5734f62b6601d88fd8ec720cb56b93fd3a030557 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, cpu: Enumerate xsaveopt Enumerate the xsaveopt feature. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 03cf24a3d933..41eebcd90fce 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -35,6 +35,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, -- cgit From a1488f8bf4d72ad724700f6e982469a1240e4264 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:48 -0700 Subject: x86, xsave: Track the offset, size of state in the xsave layout Subleaves of the cpuid vector 0xd provides the offset and size of different feature state that are managed by the xsave/xrstor. Track this for the upcoming usage during signal handling. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.262987929@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 980149867a19..4993caa4181c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -21,6 +21,8 @@ struct _fpx_sw_bytes fx_sw_reserved; struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -301,6 +303,31 @@ void __cpuinit xsave_init(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. 
+ */ +static void setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + /* * setup the xstate image representing the init state */ @@ -308,6 +335,8 @@ static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + setup_xstate_features(); } /* -- cgit From 29104e101d710dd152f807978884643a52eca8b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:49 -0700 Subject: x86, xsave: Sync xsave memory layout with its header for user handling With xsaveopt, if a processor implementation discern that a processor state component is in its initialized state it may modify the corresponding bit in the xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory layout. Hence wHile presenting the xstate information to the user, we always ensure that the memory layout of a feature will be in the init state if the corresponding header bit is zero. This ensures the consistency and avoids the condition of the user seeing some some stale state in the memory layout during signal handling, debugging etc. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.351459480@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 14 +++++++ arch/x86/include/asm/xsave.h | 10 +++++ arch/x86/kernel/i387.c | 11 ++++++ arch/x86/kernel/xsave.c | 89 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 123 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b904..bb370fd0a1c2 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -58,11 +58,25 @@ extern int restore_i387_xstate_ia32(void __user *buf); #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_xsaveopt(void) +{ + return 0; +} + static __always_inline __pure bool use_xsave(void) { return static_cpu_has(X86_FEATURE_XSAVE); } +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + #ifdef CONFIG_X86_64 /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae228..0c72adc0cb15 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -111,6 +111,16 @@ static inline void xrstor_state(struct xsave_struct *fx, u64 mask) : "memory"); } +static inline void xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b32253..6106af9fd129 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -190,6 +190,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + 
sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); } @@ -207,6 +209,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); @@ -446,6 +450,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -467,6 +473,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -533,6 +541,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4993caa4181c..368047c8d507 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -23,6 +23,76 @@ struct _fpx_sw_bytes fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. 
@@ -112,6 +182,7 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { + sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; @@ -333,10 +404,26 @@ static void setup_xstate_features(void) */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; - setup_xstate_features(); + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* -- cgit From 6bad06b768920e278c7cedfdda56a0b4c6a35ee9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, xsave: Use xsaveopt in context-switch path when supported xsaveopt is a more optimized form of xsave specifically designed for the context switch usage. xsaveopt doesn't save the state that's not modified from the prior xrstor. And if a specific feature state gets modified to the init state, then xsaveopt just updates the header bit in the xsave memory layout without updating the corresponding memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 2 +- arch/x86/include/asm/xsave.h | 9 ++++++--- arch/x86/kernel/cpu/common.c | 8 ++++++++ 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index bb370fd0a1c2..59bd93ac7fef 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -60,7 +60,7 @@ extern int restore_i387_xstate_ia32(void __user *buf); static __always_inline __pure bool use_xsaveopt(void) { - return 0; + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 0c72adc0cb15..ec86c5fd6a6e 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -125,8 +125,11 @@ static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. 
*/ - __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" - : : "D" (&(fpu->state->xsave)), - "a" (-1), "d"(-1) : "memory"); + alternative_input( + ".byte " REX_PREFIX "0x0f,0xae,0x27", + ".byte " REX_PREFIX "0x0f,0xae,0x37", + X86_FEATURE_XSAVEOPT, + [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) : + "memory"); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7358303d8cd..3f715efc594d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -- cgit From 278bc5f6abd69dd868746dbd642266ac09a9c9c6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:53:51 -0700 Subject: x86, cpu: Clean up formatting in cpufeature.h, remove override Clean up the formatting in cpufeature.h, and remove an unnecessary name override. Signed-off-by: H. Peter Anvin Cc: Suresh Siddha LKML-Reference: --- arch/x86/include/asm/cpufeature.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d5ea3e3a8a42..4be50ddd4d79 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -165,7 +165,7 @@ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_XSAVEOPT (7*32+4) /* "xsaveopt" Optimized Xsave */ +#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ @@ -173,13 +173,13 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ -#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ -#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ -#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ -#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE (9*32+0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit From 2decb194e65ab66eaf787512dc572cdc99893b24 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 19 Jul 2010 18:32:04 -0700 Subject: x86, cpu: Split addon_cpuid_features.c addon_cpuid_features.c contains exactly two almost completely unrelated functions, plus has a long and very generic name. Split it into two files, scattered.c for the scattered feature flags, and topology.c for the topology information. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 150 ----------------------------- arch/x86/kernel/cpu/scattered.c | 61 ++++++++++++ arch/x86/kernel/cpu/topology.c | 99 +++++++++++++++++++ 4 files changed, 161 insertions(+), 151 deletions(-) delete mode 100644 arch/x86/kernel/cpu/addon_cpuid_features.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/topology.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6f..5e3a3512ba05 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,7 +12,7 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c deleted file mode 100644 index 41eebcd90fce..000000000000 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. - */ -#include - -#include -#include - -#include - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; - u32 sub_leaf; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { 0, 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], - ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 - -/* leaf 0xb sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 - -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) - -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. 
- */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; - - if (c->cpuid_level < 0xb) - return; - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - - /* - * check if the cpuid leaf 0xb is actually implemented. - */ - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; - - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); - - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->initial_apicid = edx; - - /* - * Populate HT related information from sub-leaf level 0. - */ - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); - - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } - return; -#endif -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..9815364b477e --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,61 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. 
+ */ +#include + +#include +#include + +#include + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 000000000000..4397e987a1cf --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,99 @@ +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ + +#include +#include +#include +#include + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + static bool printed; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. 
+ */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } + return; +#endif +} -- cgit From 093d7b4639951ea3021a6f70d376c3ff31f4740c Mon Sep 17 00:00:00 2001 From: Miroslav Rezanina Date: Wed, 16 Sep 2009 03:56:17 -0400 Subject: xen: release unused free memory Scan an e820 table and release any memory which lies between e820 entries, as it won't be used and would just be wasted. At present this is just to release any memory beyond the end of the e820 map, but it will also deal with holes being punched in the map. Derived from patch by Miroslav Rezanina Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 - arch/x86/xen/setup.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a8..399bed2de881 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -731,7 +731,6 @@ static void set_xen_basic_apic_ops(void) #endif - static void xen_clts(void) { struct multicall_space mcs; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd4..e0942630d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "xen-ops.h" @@ -32,6 +33,56 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; + unsigned long start, end; + unsigned long len; + unsigned long pfn; + int ret; + + start = PFN_UP(start_addr); + end = PFN_UP(end_addr); + + if (end <= start) + return 0; + + len = end - start; + + set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); + reservation.nr_extents = len; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + + for(pfn = start; pfn < end; pfn++) + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + + return len; +} + +static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +{ + unsigned long last_end = 0; + unsigned long released = 0; + int i; + + for (i = 0; i < e820->nr_map; i++) { + released += xen_release_chunk(last_end, e820->map[i].addr); + last_end = e820->map[i].addr + e820->map[i].size; + } + + released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + + printk(KERN_INFO "released %ld 
pages of unused memory\n", released); + return released; +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. @@ -67,6 +118,8 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + xen_return_unused_memory(&e820); + return "Xen"; } -- cgit From f89e048e76da7ac0b4c89e75606ca7a3422886b1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 16 Sep 2009 12:38:33 -0700 Subject: xen: make sure pages are really part of domain before freeing Scan the set of pages we're freeing and make sure they're actually owned by the domain before freeing. This generally won't happen on a domU (since Xen gives us contigious memory), but it could happen if there are some hardware mappings passed through. We only bother going up to the highest page Xen actually claimed to give us, since there's definitely nothing of ours above that. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 59 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0942630d47a..9deb6bab6c78 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,52 +33,69 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); -static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, + phys_addr_t end_addr) { struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, .domid = DOMID_SELF }; - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long start, end; - unsigned long len; + unsigned long len = 0; unsigned long pfn; int ret; start = PFN_UP(start_addr); - end = PFN_UP(end_addr); + end = PFN_DOWN(end_addr); if (end <= start) return 0; - len = end - start; - - set_xen_guest_handle(reservation.extent_start, &mfn_list[start]); - reservation.nr_extents = len; - - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); - WARN(ret != (end - start), "Failed to release memory %lx-%lx err=%d\n", - start, end, ret); - - for(pfn = start; pfn < end; pfn++) - set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", + start, end); + for(pfn = start; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + if (ret == 1) { + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + len++; + } + } + printk(KERN_CONT "%ld pages freed\n", len); return len; } -static unsigned long __init xen_return_unused_memory(const struct e820map *e820) +static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, + const struct e820map *e820) { - unsigned long last_end = 0; + phys_addr_t max_addr = PFN_PHYS(max_pfn); + phys_addr_t last_end = 0; unsigned long released = 0; int i; - for (i = 0; i < e820->nr_map; i++) { - released += xen_release_chunk(last_end, e820->map[i].addr); + for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { + phys_addr_t end = e820->map[i].addr; + end = 
min(max_addr, end); + + released += xen_release_chunk(last_end, end); last_end = e820->map[i].addr + e820->map[i].size; } - released += xen_release_chunk(last_end, PFN_PHYS(xen_start_info->nr_pages)); + if (last_end < max_addr) + released += xen_release_chunk(last_end, max_addr); printk(KERN_INFO "released %ld pages of unused memory\n", released); return released; @@ -118,7 +135,7 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(&e820); + xen_return_unused_memory(xen_start_info->nr_pages, &e820); return "Xen"; } -- cgit From 5f755293ca61520b70b11afe1b1d6e1635cb6c00 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:48 -0700 Subject: x86, gcc-4.6: Avoid unused by set variables in rdmsr Avoids quite a lot of warnings with a gcc 4.6 -Wall build because this happens in a commonly used header file (apic.h) Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJme6021066@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c5bc4c2d33f5..084ef95274cd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -148,8 +148,8 @@ static inline unsigned long long native_read_pmc(int counter) #define rdmsr(msr, val1, val2) \ do { \ u64 __val = native_read_msr((msr)); \ - (val1) = (u32)__val; \ - (val2) = (u32)(__val >> 32); \ + (void)((val1) = (u32)__val); \ + (void)((val2) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(unsigned msr, unsigned low, unsigned high) -- cgit From fa10ba64ac94fec4611b79804023eb087862ffe0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:49 -0700 Subject: x86, gcc-4.6: Fix set but not read variables Just some dead code, no real bugs. Found by gcc 4.6 -Wall Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJnQ0021072@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf235..a2e0caf26e17 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. 
*/ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); -- cgit From 7aa2b5f8ec60505160df1c25398e8286c8432689 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:48 +0200 Subject: x86, xsave: Do not include asm/i387.h in asm/xsave.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no dependencies to asm/i387.h. Instead, if including only xsave.h the following error occurs: .../arch/x86/include/asm/i387.h:110: error: ‘XSTATE_FP’ undeclared (first use in this function) .../arch/x86/include/asm/i387.h:110: error: (Each undeclared identifier is reported only once .../arch/x86/include/asm/i387.h:110: error: for each function it appears in.) This patch fixes this. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index ec86c5fd6a6e..94d5f84d89f2 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -3,7 +3,6 @@ #include #include -#include #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 -- cgit From db10db48b2c530def21bfd76d576702c7df7f620 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:49 +0200 Subject: x86, xsave: 32/64 bit boot cpu check unification in initialization Boot cpu id is always 0, thus simplifying and unifying boot cpu check. boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f715efc594d..26804b2986b8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1273,7 +1273,7 @@ void __cpuinit cpu_init(void) /* * Boot processor to setup the FP and extended state context info. 
*/ - if (smp_processor_id() == boot_cpu_id) + if (!smp_processor_id()) init_thread_xstate(); xsave_init(); -- cgit From 82d4150cec83b9775f84810b39a1c0b91585d429 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:51 +0200 Subject: x86, xsave: Move boot cpu initialization to xsave_init() This patch moves boot cpu initialization to xsave_init(). Now all cpus are initialized in one single function. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 6 ------ arch/x86/kernel/i387.c | 5 ----- arch/x86/kernel/xsave.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26804b2986b8..40561085d4f3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1270,12 +1270,6 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 6106af9fd129..2f32ef05f10e 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -93,11 +93,6 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); xsave_init(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 368047c8d507..ab9ad48b6530 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,7 +360,7 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static void __cpuinit __xsave_init(void) { if (!cpu_has_xsave) return; @@ -446,7 +446,7 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + __xsave_init(); /* * Recompute the context size for enabled features @@ -463,3 +463,13 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +void __cpuinit xsave_init(void) +{ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + __xsave_init(); +} -- cgit From 92851e2fca48f1893f899963c13b55b61ac6956c Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 20 Jul 2010 15:19:46 -0700 Subject: x86, mm: Create symbolic index into address_markers array Without this, adding entries into the address_markers array means adding more and more of an #ifdef maze in pt_dump_init(). By using indices, we can keep it a bit saner. Signed-off-by: Andres Salomon LKML-Reference: <201007202219.o6KMJkUs021052@imap1.linux-foundation.org> Cc: Jordan Crouse Signed-off-by: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/dump_pagetables.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a725b7f760ae..0002a3a33081 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -37,6 +37,28 @@ struct addr_marker { const char *name; }; +/* indices for address_markers; keep sync'd w/ address_markers below */ +enum address_markers_idx { + USER_SPACE_NR = 0, +#ifdef CONFIG_X86_64 + KERNEL_SPACE_NR, + LOW_KERNEL_NR, + VMALLOC_START_NR, + VMEMMAP_START_NR, + HIGH_KERNEL_NR, + MODULES_VADDR_NR, + MODULES_END_NR, +#else + KERNEL_SPACE_NR, + VMALLOC_START_NR, + VMALLOC_END_NR, +# ifdef CONFIG_HIGHMEM + PKMAP_BASE_NR, +# endif + FIXADDR_START_NR, +#endif +}; + /* Address space markers hints */ static struct addr_marker address_markers[] = { { 0, "User Space" }, @@ -331,14 +353,12 @@ static int pt_dump_init(void) #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ - address_markers[2].start_address = VMALLOC_START; - address_markers[3].start_address = VMALLOC_END; + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; # ifdef CONFIG_HIGHMEM - address_markers[4].start_address = PKMAP_BASE; - address_markers[5].start_address = FIXADDR_START; -# else - address_markers[4].start_address = FIXADDR_START; + address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE; # endif + address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, -- cgit From 468c30f2bbdf1ba0fbf16667eade23a46eaa8f06 Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Tue, 20 Jul 2010 15:19:47 -0700 Subject: x86, iomap: Fix wrong page aligned size calculation in ioremapping code x86 early_iounmap(): fix off-by-one error in page alignment of allocation size for sizes where size%PAGE_SIZE==1. Signed-off-by: Florian Zumbiehl LKML-Reference: <201007202219.o6KMJlES021058@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d41d3a9036ca..3ba6e0608c55 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -611,7 +611,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) return; } offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { -- cgit From a751bd858b16dce57f3b6b85ba07946df1bd7be4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Stay on fast path when count > 0 in __up_write() When count > 0 there is no need to take the call_rwsem_wake path. If we did take that path, it would just return without doing anything due to the active count not being zero. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJj9x021042@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/rwsem.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 606ede126972..5bf5e04e497f 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -216,9 +216,8 @@ static inline void __up_write(struct rw_semaphore *sem) rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* tries to transition - 0xffff0001 -> 0x00000000 */ - " jz 1f\n" + /* subtracts 0xffff0001, returns the old value */ + " jns 1f\n\t" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" -- cgit From b4bcb4c28c64cc2876b4aef218d992ce806194da Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 20 Jul 2010 15:19:45 -0700 Subject: x86, rwsem: Minor cleanups Clarified few comments and made initialization of %edx/%rdx more uniform accross __down_write_nested, __up_read and __up_write functions. Signed-off-by: Michel Lespinasse LKML-Reference: <201007202219.o6KMJkiA021048@imap1.linux-foundation.org> Acked-by: David Howells Cc: Mike Waychison Cc: Suleiman Souhlal Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 5bf5e04e497f..d1e41b0f9b60 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -118,7 +118,7 @@ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" LOCK_PREFIX _ASM_INC "(%1)\n\t" - /* adds 0x00000001, returns the old value */ + /* adds 0x00000001 */ " jns 1f\n" " call call_rwsem_down_read_failed\n" "1:\n\t" @@ -156,11 +156,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { rwsem_count_t tmp; - - tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" - /* subtract 0x0000ffff, returns the old value */ + /* adds 0xffff0001, returns the old value */ " test %1,%1\n\t" /* was the count 0 before? 
*/ " jz 1f\n" @@ -168,7 +166,7 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) "1:\n" "# ending down_write" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); } @@ -195,16 +193,16 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n" "# ending __up_read\n" : "+m" (sem->count), "=d" (tmp) - : "a" (sem), "1" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); } @@ -218,7 +216,7 @@ static inline void __up_write(struct rw_semaphore *sem) LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 0xffff0001, returns the old value */ " jns 1f\n\t" - " call call_rwsem_wake\n" + " call call_rwsem_wake\n" /* expects old value in %edx */ "1:\n\t" "# ending __up_write\n" : "+m" (sem->count), "=d" (tmp) -- cgit From 3f8afb77cd8a672f024e4a16763ef177bc16c8f8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 21 Jul 2010 14:47:05 +0200 Subject: x86, tlb: Clean up and correct used type smp_processor_id() returns an int and not an unsigned long. Also, since the function is small enough, there's no need for a local variable caching its value. No functionality change, just cleanup. Signed-off-by: Borislav Petkov LKML-Reference: <20100721124705.GA674@aftab> Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 426f3a1a64d3..c03f14ab6667 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -278,11 +278,9 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) static void do_flush_tlb_all(void *info) { - unsigned long cpu = smp_processor_id(); - __flush_tlb_all(); if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) - leave_mm(cpu); + leave_mm(smp_processor_id()); } void flush_tlb_all(void) -- cgit From 0e49bf66d2ca649b167428adddbbbe9d9bd4894c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:52 +0200 Subject: x86, xsave: Separate fpu and xsave initialization As xsave also supports other than fpu features, it should be initialized independently of the fpu. This patch moves this out of fpu initialization. There is also a lot of cross referencing between fpu and xsave code. This patch reduces this by making xsave_cntxt_init() and init_thread_xstate() static functions. The patch moves the cpu_has_xsave check at the beginning of xsave_init(). All other checks may removed then. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/i387.h | 1 - arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/i387.c | 27 +++++++++++++++++++-------- arch/x86/kernel/xsave.c | 10 +++++----- 5 files changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 59bd93ac7fef..509ddabeae25 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -31,7 +31,6 @@ extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); extern void __math_state_restore(void); -extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 94d5f84d89f2..4d3b5d1fc028 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -28,7 +28,6 @@ extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void xsave_cntxt_init(void); extern void xsave_init(void); extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 40561085d4f3..94c36c7ac183 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1210,6 +1210,7 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1270,6 +1271,7 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2f32ef05f10e..e73c54ebafce 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. 
*/ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,14 +94,24 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - xsave_init(); + if (!smp_processor_id()) + init_thread_xstate(); mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ + +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) +{ + if (!smp_processor_id()) + init_thread_xstate(); +} + +#endif /* CONFIG_X86_32 */ static void fpu_finit(struct fpu *fpu) { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ab9ad48b6530..550bf45236f4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -362,9 +362,6 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); */ static void __cpuinit __xsave_init(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); /* @@ -429,7 +426,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __cpuinit xsave_cntxt_init(void) { unsigned int eax, ebx, ecx, edx; @@ -466,10 +463,13 @@ void __ref xsave_cntxt_init(void) void __cpuinit xsave_init(void) { + if (!cpu_has_xsave) + return; + /* * Boot processor to setup the FP and extended state context info. */ if (!smp_processor_id()) - init_thread_xstate(); + xsave_cntxt_init(); __xsave_init(); } -- cgit From 97e80a70db689fb1e876df9f12305cc72f85ca53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:53 +0200 Subject: x86, xsave: Introduce xstate enable functions The patch renames xsave_cntxt_init() and __xsave_init() into xstate_enable_boot_cpu() and xstate_enable() as this names are more meaningful. It also removes the duplicate xcr setup for the boot cpu. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 550bf45236f4..2322f586c051 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,15 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static void __cpuinit __xsave_init(void) +static inline void xstate_enable(u64 mask) { set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); } /* @@ -426,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -static void __cpuinit xsave_cntxt_init(void) +static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -443,7 +438,8 @@ static void __cpuinit xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - __xsave_init(); + + xstate_enable(pcntxt_mask); /* * Recompute the context size for enabled features @@ -470,6 +466,7 @@ void __cpuinit xsave_init(void) * Boot processor to setup the FP and extended state context info. 
*/ if (!smp_processor_id()) - xsave_cntxt_init(); - __xsave_init(); + xstate_enable_boot_cpu(); + else + xstate_enable(pcntxt_mask); } -- cgit From ee813d53a8e980a3a28318efb8935d45723f5211 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:54 +0200 Subject: x86, xsave: Check cpuid level for XSTATE_CPUID (0x0d) The patch introduces the XSTATE_CPUID macro and adds a check that tests if XSTATE_CPUID exists. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-4-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 2 ++ arch/x86/kernel/xsave.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 4d3b5d1fc028..d1b5f3a2fa20 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -4,6 +4,8 @@ #include #include +#define XSTATE_CPUID 0x0000000d + #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2322f586c051..5adb7fb408f0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -379,7 +379,7 @@ static void setup_xstate_features(void) xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); do { - cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); if (eax == 0) break; @@ -425,7 +425,12 @@ static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -444,7 +449,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); -- cgit From 45c2d7f46211a0b1f6b425c59575c53145afc4b4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:55 +0200 Subject: x86, xsave: Make init_xstate_buf static The pointer is only used in xsave.c. Making it static. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/xsave.h | 1 - arch/x86/kernel/xsave.c | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index d1b5f3a2fa20..0ae6b9961985 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,7 +27,6 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; -extern struct xsave_struct *init_xstate_buf; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_init(void); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 5adb7fb408f0..3b44a9b1eca4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,6 +16,11 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. 
+ */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; @@ -348,11 +353,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif -- cgit From 4995b9dba908436c1611454f9bd2cb3ddf6babee Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:56 +0200 Subject: x86, xsave: Add __init attribute to setup_xstate_features() This is called only from initialization code. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-6-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3b44a9b1eca4..cfc7901ee94e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -370,7 +370,7 @@ static inline void xstate_enable(u64 mask) * Record the offsets and sizes of different state managed by the xsave * memory layout. */ -static void setup_xstate_features(void) +static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; -- cgit From 1cff92d8fdb27684308864d9cdb324bee43b40ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 Jul 2010 14:23:10 -0700 Subject: x86, xsave: Make xstate_enable_boot_cpu() __init, protect on CPU 0 xstate_enable_boot_cpu() is, as the name implies, only used on the boot CPU; furthermore, it invokes alloc_bootmem(), which is __init; hence it needs to be tagged __init rather than __cpuinit. Furthermore, it is *not* safe in the long run to rely on CPU 0 only coming online during the early boot -- at some point we're going to support offlining (and re-onlining) the boot CPU, and at that point we must not call xstate_enable_boot_cpu() again. The code is a fair bit more obscure than one would like, because the __ref overrides aren't quite powerful enough. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha Cc: Robert Richter LKML-Reference: <4C476236.1020302@zytor.com> --- arch/x86/kernel/xsave.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cfc7901ee94e..b2549c3eb2c3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,10 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static inline void xstate_enable(u64 mask) +static inline void xstate_enable(void) { set_in_cr4(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* @@ -421,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. 
*/ -static void __cpuinit xstate_enable_boot_cpu(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -444,7 +444,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xstate_enable(pcntxt_mask); + xstate_enable(); /* * Recompute the context size for enabled features @@ -462,16 +462,22 @@ static void __cpuinit xstate_enable_boot_cpu(void) pcntxt_mask, xstate_size); } +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ void __cpuinit xsave_init(void) { + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + if (!cpu_has_xsave) return; - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - xstate_enable_boot_cpu(); - else - xstate_enable(pcntxt_mask); + this_func = next_func; + next_func = xstate_enable; + this_func(); } -- cgit From 8c06585d6431addadd94903843dfbcd315b42d4e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:26 -0400 Subject: x86: Remove redundant K6 MSRs MSR_K6_EFER is unused, and MSR_K6_STAR is redundant with MSR_STAR. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-1-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 2 -- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 8 ++++---- arch/x86/kvm/x86.c | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae4318629..6068e0e06e00 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -159,8 +159,6 @@ #define MSR_K7_FID_VID_STATUS 0xc0010042 /* K6 MSRs */ -#define MSR_K6_EFER 0xc0000080 -#define MSR_K6_STAR 0xc0000081 #define MSR_K6_WHCR 0xc0000082 #define MSR_K6_UWCCR 0xc0000085 #define MSR_K6_EPMR 0xc0000086 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..24a220696298 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -130,7 +130,7 @@ static struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { - { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 { .index = MSR_GS_BASE, .always = true }, @@ -2431,7 +2431,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = tsc_offset + native_read_tsc(); break; } - case MSR_K6_STAR: + case MSR_STAR: *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 @@ -2555,7 +2555,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; } - case MSR_K6_STAR: + case MSR_STAR: svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..b42ad25d5647 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -231,14 +231,14 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. 
*/ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -1057,10 +1057,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0 && vmx->rdtscp_enabled) move_msr_up(vmx, index, save_nmsrs++); /* - * MSR_K6_STAR is only needed on long mode guests, and only + * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_K6_STAR); + index = __find_msr_index(vmx, MSR_STAR); if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05d571f6f196..6127468ebbd2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -671,7 +671,7 @@ static u32 msrs_to_save[] = { HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, + MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif -- cgit From cfaa71ee9794472598d3966c3315cd6bd8f953d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:27 -0400 Subject: x86: Use symbolic MSR names Use symbolic MSR names instead of hardcoding the MSR index. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-2-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/realmode/wakeup.S | 2 +- arch/x86/kernel/verify_cpu_64.S | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e296010..28595d6df47c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include +#include verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr -- cgit From 650fb4393dff543bc980d361555c489fbdeed088 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:28 -0400 Subject: x86-64: Simplify loading initial_gs Load initial_gs as two 32-bit values instead of splitting a 64-bit value. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-3-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. 
*/ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. -- cgit From 18f19aa62a267f2f759e278018f1032adf4c3774 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 14 May 2010 12:38:24 +0100 Subject: xen: Add support for HVM hypercalls. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Sheng Yang Signed-off-by: Stefano Stabellini --- arch/x86/include/asm/xen/hypercall.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 9c371e4a9fa6..7fda040a76cd 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsigned long arg) return _hypercall2(int, nmi_op, op, arg); } +static inline unsigned long __must_check +HYPERVISOR_hvm_op(int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { -- cgit From bee6ab53e652a414af20392899879b58cd80d033 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:39:33 +0100 Subject: x86: early PV on HVM features initialization. Initialize basic pv on hvm features adding a new Xen HVM specific hypervisor_x86 structure. Don't try to initialize xen-kbdfront and xen-fbfront when running on HVM because the backends are not available. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/hypervisor.h | 1 + arch/x86/kernel/cpu/hypervisor.c | 1 + arch/x86/xen/enlighten.c | 100 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 70abda7058c8..ff2546ce7178 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; #endif diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc56a8f..bffd47c10fed 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,6 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, + &x86_hyper_xen_hvm, }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a8..09b36e9d507a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,9 @@ #include #include #include +#include #include +#include #include "xen-ops.h" #include "mmu.h" @@ -76,6 +79,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; +RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. 
@@ -1206,3 +1211,98 @@ asmlinkage void __init xen_start_kernel(void) x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } + +static uint32_t xen_cpuid_base(void) +{ + uint32_t base, eax, ebx, ecx, edx; + char signature[13]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &ebx, &ecx, &edx); + *(uint32_t *)(signature + 0) = ebx; + *(uint32_t *)(signature + 4) = ecx; + *(uint32_t *)(signature + 8) = edx; + signature[12] = 0; + + if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) + return base; + } + + return 0; +} + +static int init_hvm_pv_info(int *major, int *minor) +{ + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + *major = eax >> 16; + *minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info = xen_info; + pv_info.kernel_rpl = 0; + + xen_domain_type = XEN_HVM_DOMAIN; + + return 0; +} + +static void __init init_shared_info(void) +{ + struct xen_add_to_physmap xatp; + struct shared_info *shared_info_page; + + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; +} + +static void __init xen_hvm_guest_init(void) +{ + int r; + int major, minor; + + r = init_hvm_pv_info(&major, &minor); + if (r < 0) + return; + + init_shared_info(); +} + +static bool __init xen_hvm_platform(void) +{ + if (xen_pv_domain()) + return false; + + if (!xen_cpuid_base()) + return false; + + return true; +} + +const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { + .name = "Xen HVM", + .detect = xen_hvm_platform, + .init_platform = xen_hvm_guest_init, +}; +EXPORT_SYMBOL(x86_hyper_xen_hvm); -- cgit From 38e20b07efd541a959de367dc90a17f92ce2e8a6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:40:51 +0100 Subject: x86/xen: event channels delivery on HVM. Set the callback to receive evtchns from Xen, using the callback vector delivery mechanism. The traditional way for receiving event channel notifications from Xen is via the interrupts from the platform PCI device. The callback vector is a newer alternative that allow us to receive notifications on any vcpu and doesn't need any PCI support: we allocate a vector exclusively to receive events, in the vector handler we don't need to interact with the vlapic, therefore we avoid a VMEXIT. 
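[Editor's illustration, not part of the patch series.] Before the handler wired up below can ever fire, the guest has to tell Xen which vector to inject. A minimal sketch of that registration step follows; it assumes the HVMOP_set_param hypercall and the HVM_PARAM_CALLBACK_IRQ parameter, and both the value encoding (delivery type 2 in the top byte, vector number in the low byte) and the header locations are assumptions of this sketch, not something taken from the series itself.

/*
 * Hypothetical sketch: advertise a callback vector to the hypervisor.
 * The (2ULL << 56) "deliver via vector" encoding is assumed, not taken
 * from the patches above.
 */
#include <xen/interface/hvm/hvm_op.h>	/* HVMOP_set_param, struct xen_hvm_param */
#include <xen/interface/hvm/params.h>	/* HVM_PARAM_CALLBACK_IRQ */
#include <asm/xen/hypercall.h>		/* HYPERVISOR_hvm_op() */

static int xen_advertise_callback_vector(u8 vector)
{
	struct xen_hvm_param a = {
		.domid = DOMID_SELF,
		.index = HVM_PARAM_CALLBACK_IRQ,
		.value = (2ULL << 56) | vector,	/* assumed encoding */
	};

	return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
}

/* A typical call site would pass XEN_HVM_EVTCHN_CALLBACK as the vector. */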
Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/irq_vectors.h | 3 +++ arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ arch/x86/xen/enlighten.c | 28 ++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 2 ++ 5 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 8767d99c4f64..e2ca30092557 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -125,6 +125,9 @@ */ #define MCE_SELF_VECTOR 0xeb +/* Xen vector callback to receive events in a HVM domain */ +#define XEN_HVM_EVTCHN_CALLBACK 0xe9 + #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..6b196834a0dd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1166,6 +1166,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff139837..490ae2bb18a8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 09b36e9d507a..b211a04c4b2c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include @@ -80,6 +82,8 @@ struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* * Point at some empty memory to start with. 
We map the real shared_info @@ -1277,6 +1281,24 @@ static void __init init_shared_info(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; } +static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + static void __init xen_hvm_guest_init(void) { int r; @@ -1287,6 +1309,12 @@ static void __init xen_hvm_guest_init(void) return; init_shared_info(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + register_cpu_notifier(&xen_hvm_cpu_notifier); + have_vcpu_info_placement = 0; + x86_init.irqs.intr_init = xen_init_IRQ; } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bce..0d0e0e6a7479 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -38,6 +38,8 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); + void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); -- cgit From 016b6f5fe8398b0291cece60b749d7c930a2e09c Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:45:07 +0100 Subject: xen: Add suspend/resume support for PV on HVM guests. Suspend/resume requires few different things on HVM: the suspend hypercall is different; we don't need to save/restore memory related settings; except the shared info page and the callback mechanism. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 24 ++++++++++++++++++------ arch/x86/xen/suspend.c | 6 ++++++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b211a04c4b2c..127c95c8d15e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1262,13 +1262,15 @@ static int init_hvm_pv_info(int *major, int *minor) return 0; } -static void __init init_shared_info(void) +void xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; - struct shared_info *shared_info_page; + static struct shared_info *shared_info_page = 0; - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; @@ -1278,7 +1280,17 @@ static void __init init_shared_info(void) HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. 
*/ + for_each_online_cpu(cpu) { + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } } static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, @@ -1308,7 +1320,7 @@ static void __init xen_hvm_guest_init(void) if (r < 0) return; - init_shared_info(); + xen_hvm_init_shared_info(); if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index a9c661108034..d07479c340f5 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -26,6 +26,12 @@ void xen_pre_suspend(void) BUG(); } +void xen_hvm_post_suspend(int suspend_cancelled) +{ + xen_hvm_init_shared_info(); + xen_callback_vector(); +} + void xen_post_suspend(int suspend_cancelled) { xen_build_mfn_list_list(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0d0e0e6a7479..01c9dd386522 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -39,6 +39,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit From 409771d258e9dd71c30f3c9520fd2b796ffc40f0 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:48:19 +0100 Subject: x86: Use xen_vcpuop_clockevent, xen_clocksource and xen wallclock. Use xen_vcpuop_clockevent instead of hpet and APIC timers as main clockevent device on all vcpus, use the xen wallclock time as wallclock instead of rtc and use xen_clocksource as clocksource. The pv clock algorithm needs to work correctly for the xen_clocksource and xen wallclock to be usable, only modern Xen versions offer a reliable pv clock in HVM guests (XENFEAT_hvm_safe_pvclock). Using the hpet as clocksource means a VMEXIT every time we read/write to the hpet mmio addresses, pvclock give us a better rating without VMEXITs. Same goes for the xen wallclock and xen_vcpuop_clockevent Signed-off-by: Stefano Stabellini Signed-off-by: Don Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 14 ++---------- arch/x86/xen/suspend.c | 6 +++++ arch/x86/xen/time.c | 58 +++++++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/xen-ops.h | 7 ++---- 4 files changed, 63 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 127c95c8d15e..a90172963884 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -935,10 +935,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, }; -static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, -}; - static const struct pv_cpu_ops xen_cpu_ops __initdata = { .cpuid = xen_cpuid, @@ -1076,7 +1072,6 @@ asmlinkage void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; @@ -1084,13 +1079,7 @@ asmlinkage void __init xen_start_kernel(void) x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; - x86_init.timers.timer_init = xen_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = x86_init_noop; - - x86_platform.calibrate_tsc = xen_tsc_khz; - x86_platform.get_wallclock = xen_get_wallclock; - x86_platform.set_wallclock = xen_set_wallclock; + xen_init_time_ops(); /* * Set up some pagetable state before starting to set any ptes. 
@@ -1327,6 +1316,7 @@ static void __init xen_hvm_guest_init(void) register_cpu_notifier(&xen_hvm_cpu_notifier); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d07479c340f5..1d789d56877c 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -28,8 +28,14 @@ void xen_pre_suspend(void) void xen_hvm_post_suspend(int suspend_cancelled) { + int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { + for_each_online_cpu(cpu) { + xen_setup_runstate_info(cpu); + } + } } void xen_post_suspend(int suspend_cancelled) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed302..4780e55886a5 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -160,7 +161,7 @@ static void do_stolen_accounting(void) * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED * states. */ -unsigned long long xen_sched_clock(void) +static unsigned long long xen_sched_clock(void) { struct vcpu_runstate_info state; cycle_t now; @@ -195,7 +196,7 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ -unsigned long xen_tsc_khz(void) +static unsigned long xen_tsc_khz(void) { struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; @@ -230,7 +231,7 @@ static void xen_read_wallclock(struct timespec *ts) put_cpu_var(xen_vcpu); } -unsigned long xen_get_wallclock(void) +static unsigned long xen_get_wallclock(void) { struct timespec ts; @@ -238,7 +239,7 @@ unsigned long xen_get_wallclock(void) return ts.tv_sec; } -int xen_set_wallclock(unsigned long now) +static int xen_set_wallclock(unsigned long now) { /* do nothing for domU */ return -1; @@ -473,7 +474,11 @@ void xen_timer_resume(void) } } -__init void xen_time_init(void) +static const struct pv_time_ops xen_time_ops __initdata = { + .sched_clock = xen_sched_clock, +}; + +static __init void xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -497,3 +502,46 @@ __init void xen_time_init(void) xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } + +__init void xen_init_time_ops(void) +{ + pv_time_ops = xen_time_ops; + + x86_init.timers.timer_init = xen_time_init; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + +static void xen_hvm_setup_cpu_clockevents(void) +{ + int cpu = smp_processor_id(); + xen_setup_runstate_info(cpu); + xen_setup_timer(cpu); + xen_setup_cpu_clockevents(); +} + +__init void xen_hvm_init_time_ops(void) +{ + /* vector callback is needed otherwise we cannot receive interrupts + * on cpu > 0 */ + if (!xen_have_vector_callback && num_present_cpus() > 1) + return; + if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { + printk(KERN_INFO "Xen doesn't support pvclock on HVM," + "disable pv timer\n"); + return; + } + + pv_time_ops = xen_time_ops; + x86_init.timers.setup_percpu_clockev = xen_time_init; + x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents; + + x86_platform.calibrate_tsc = xen_tsc_khz; + x86_platform.get_wallclock = xen_get_wallclock; + x86_platform.set_wallclock = xen_set_wallclock; +} + diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 
01c9dd386522..089d18923d2b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -49,11 +49,8 @@ void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); -unsigned long xen_tsc_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); +void __init xen_init_time_ops(void); +void __init xen_hvm_init_time_ops(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit From c1c5413ad58cb73267d328e6020268aa2e50d8ca Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Fri, 14 May 2010 12:44:30 +0100 Subject: x86: Unplug emulated disks and nics. Add a xen_emul_unplug command line option to the kernel to unplug xen emulated disks and nics. Set the default value of xen_emul_unplug depending on whether or not the Xen PV frontends and the Xen platform PCI driver have been compiled for this kernel (modules or built-in are both OK). The user can specify xen_emul_unplug=ignore to enable PV drivers on HVM even if the host platform doesn't support unplug. Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/platform-pci-unplug.c | 135 +++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 1 + 4 files changed, 138 insertions(+), 1 deletion(-) create mode 100644 arch/x86/xen/platform-pci-unplug.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f2..930954685980 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -12,7 +12,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o + grant-table.o suspend.o platform-pci-unplug.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a90172963884..157c93b62dd4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1314,6 +1314,7 @@ static void __init xen_hvm_guest_init(void) if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c new file mode 100644 index 000000000000..2f7f3fb34777 --- /dev/null +++ b/arch/x86/xen/platform-pci-unplug.c @@ -0,0 +1,135 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + +#include +#include +#include + +#include + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +/* store the value of xen_emul_unplug after the unplug is done */ +int xen_platform_pci_unplug; +EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +static int xen_emul_unplug; + +static int __init check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_ERR "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +void __init xen_unplug_emulated_devices(void) +{ + int r; + + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the version is really old + * and the user told us to ignore it. */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_IGNORE))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). 
*/ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_IGNORE)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "ignore", l)) + xen_emul_unplug |= XEN_UNPLUG_IGNORE; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 089d18923d2b..ed776949024c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -40,6 +40,7 @@ void xen_vcpu_restore(void); void xen_callback_vector(void); void xen_hvm_init_shared_info(void); +void __init xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); -- cgit From 5915100106b8f14a38053ad6c03a664d208aeaa2 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 17 Jun 2010 14:22:52 +0100 Subject: x86: Call HVMOP_pagetable_dying on exit_mmap. When a pagetable is about to be destroyed, we notify Xen so that the hypervisor can clear the related shadow pagetable. 
Signed-off-by: Stefano Stabellini Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.h | 1 + 3 files changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 157c93b62dd4..75b479a684fd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1318,6 +1318,7 @@ static void __init xen_hvm_guest_init(void) have_vcpu_info_placement = 0; x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); } static bool __init xen_hvm_platform(void) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..84648c1bf138 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include @@ -1941,6 +1942,38 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +static void xen_hvm_exit_mmap(struct mm_struct *mm) +{ + struct xen_hvm_pagetable_dying a; + int rc; + + a.domid = DOMID_SELF; + a.gpa = __pa(mm->pgd); + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + WARN_ON_ONCE(rc < 0); +} + +static int is_pagetable_dying_supported(void) +{ + struct xen_hvm_pagetable_dying a; + int rc = 0; + + a.domid = DOMID_SELF; + a.gpa = 0x00; + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); + if (rc < 0) { + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); + return 0; + } + return 1; +} + +void __init xen_hvm_init_mmu_ops(void) +{ + if (is_pagetable_dying_supported()) + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; +} + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 5fe6bc7f5ecf..fa938c4aa2f7 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -60,4 +60,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); +extern void xen_hvm_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit From b43275d661baa5f1f72dacd9033d6eda09d9fe87 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 Jul 2010 10:38:45 -0700 Subject: xen/pvhvm: fix build problem when !CONFIG_XEN x86_hyper_xen_hvm is only defined when Xen is enabled in the kernel config. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/hypervisor.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bffd47c10fed..5bccedcb9121 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN &x86_hyper_xen_hvm, +#endif }; const struct hypervisor_x86 *x86_hyper; -- cgit From 8c73626ab28527b7eb7f3061c027fbfe530c488c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:18 -0700 Subject: x86: Fix vtime/file timestamp inconsistencies Due to vtime calling vgettimeofday(), its possible that an application could call time();create("stuff",O_RDRW); only to see the file's creation timestamp to be before the value returned by time. A similar way to reproduce the issue is to compare the vsyscall time() with the syscall time(), and observe ordering issues. 
The modified test case from Oleg Nesterov below can illustrate this: int main(void) { time_t sec1,sec2; do { sec1 = time(&sec2); sec2 = syscall(__NR_time, NULL); } while (sec1 <= sec2); printf("vtime: %d.000000\n", sec1); printf("time: %d.000000\n", sec2); return 0; } The proper fix is to make vtime use the same time value as current_kernel_time() (which is exported via update_vsyscall) instead of vgettime(). Thanks to Jiri Olsa for bringing up the issue and catching bugs in earlier verisons of this fix. Signed-off-by: John Stultz Cc: Jiri Olsa Cc: Oleg Nesterov LKML-Reference: <1279068988-21864-2-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dce0c3c5a787 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; -- cgit From 592913ecb87a9e06f98ddb55b298f1a66bf94c6b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:20 -0700 Subject: time: Kill off CONFIG_GENERIC_TIME Now that all arches have been converted over to use generic time via clocksources or arch_gettimeoffset(), we can remove the GENERIC_TIME config option and simplify the generic code. Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-4-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a66..546b610ad71c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,9 +72,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 -config GENERIC_TIME - def_bool y - config GENERIC_CMOS_UPDATE def_bool y @@ -2046,7 +2043,7 @@ config SCx200 config SCx200HR_TIMER tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" - depends on SCx200 && GENERIC_TIME + depends on SCx200 default y ---help--- This driver provides a clocksource built upon the on-chip -- cgit From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:23 -0700 Subject: timkeeping: Fix update_vsyscall to provide wall_to_monotonic offset update_vsyscall() did not provide the wall_to_monotoinc offset, so arch specific implementations tend to reference wall_to_monotonic directly. 
This limits future cleanups in the timekeeping core, so this patch fixes the update_vsyscall interface to provide wall_to_monotonic, allowing wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: Anton Blanchard Cc: Paul Mackerras Cc: Tony Luck LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dce0c3c5a787..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -- cgit From f12a15be63d1de9a35971f35f06b73088fa25c3a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:27 -0700 Subject: x86: Convert common clocksources to use clocksource_register_hz/khz This converts the most common of the x86 clocksources over to use clocksource_register_hz/khz. 
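(Editorial aside, not part of the commit.) The HPET reports its period in femtoseconds per cycle, so the frequency handed to clocksource_register_hz() is simply FSEC_PER_SEC / hpet_period, exactly as the comment added in the hunk below spells out. A self-contained sketch of that arithmetic, using an assumed example period of 69841279 fs (a common 14.31818 MHz HPET) rather than any value from the patch:

/*
 * Standalone sketch of the fsec-per-cycle -> Hz conversion performed
 * before clocksource_register_hz().  The example period is an assumed
 * illustrative value, not taken from the patch.
 */
#include <stdio.h>
#include <stdint.h>

#define FSEC_PER_SEC 1000000000000000ULL	/* 10^15 femtoseconds per second */

int main(void)
{
	uint64_t hpet_period = 69841279ULL;	/* fs per cycle (example) */
	uint64_t hpet_freq = FSEC_PER_SEC / hpet_period;

	/* prints roughly 14318180 Hz, i.e. the classic 14.31818 MHz HPET */
	printf("hpet_freq = %llu Hz\n", (unsigned long long)hpet_freq);
	return 0;
}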
Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-11-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 13 +++++++++---- arch/x86/kernel/tsc.c | 5 +---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d731175..33dbcc4ec5ff 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..ce8e50239332 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 -- cgit From 80a506b8fdcfa868bb53eb740f928217d0966fc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 17:14:24 +0200 Subject: x86/amd-iommu: Export cache-coherency capability This patch exports the capability of the AMD IOMMU to force cache coherency of DMA transactions through the IOMMU-API. This is required to disable some nasty hacks in KVM when this capability is not available. 
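(Editorial aside, not part of the commit.) Consumers such as KVM's device assignment code are expected to query this through the generic IOMMU API rather than poking the AMD driver directly. A rough sketch, assuming the iommu_domain_has_cap() interface of this kernel generation; this is not code from the patch:

/*
 * Hypothetical caller: test whether an IOMMU domain enforces cache
 * coherency for DMA before relaxing guest cache handling.  Assumes
 * the iommu_domain_has_cap() API; not taken from the patch above.
 */
#include <linux/iommu.h>

static bool domain_snoops_dma(struct iommu_domain *domain)
{
	return iommu_domain_has_cap(domain, IOMMU_CAP_CACHE_COHERENCY) != 0;
}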
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 29dd3b9f2f09..fa044e1e30a2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } -- cgit From d2cb214551de8180542a04ec8c86c0c9412c5124 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 26 Mar 2010 15:37:50 -0700 Subject: xen/mmu: inhibit vmap aliases rather than trying to clear them out Rather than trying to deal with aliases once they appear, just completely inhibit them. Mostly the removal of aliases was managable, but it comes unstuck in xen_create_contiguous_region() because it gets executed at interrupt time (as a result of dma_alloc_coherent()), which causes all sorts of confusion in the vmap code, as it was never intended to be run in interrupt context. This has the unfortunate side effect of removing all the unmap batching the vmap code so carefully added, but that can't be helped. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb51402dd99a..ef5728dde8f3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1015,8 +1016,6 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page, read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - vm_unmap_aliases(); - xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -1580,7 +1579,6 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - vm_unmap_aliases(); if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE && USE_SPLIT_PTLOCKS) @@ -2026,6 +2024,8 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; + + vmap_lazy_unmap = false; } /* Protected by xen_reservation_lock. */ @@ -2165,8 +2165,6 @@ int xen_create_contiguous_region(unsigned long vstart, unsigned int order, memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Zap current PTEs, remembering MFNs. */ @@ -2204,8 +2202,6 @@ void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) memset((void *) vstart, 0, PAGE_SIZE << order); - vm_unmap_aliases(); - spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ -- cgit From bbbe57386e857eb2a8d4abcae71063c819c06ff1 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 9 Feb 2010 14:30:55 -0500 Subject: pci-swiotlb-xen: Add glue code to setup dma_ops utilizing xen_swiotlb_* functions. We add the glue code that sets up a dma_ops structure with the xen_swiotlb_* functions. 
The code turns on xen_swiotlb flag when it detects it is running under Xen and it is either in privileged mode or the iommu=soft flag was passed in. It also disables the bare-metal SWIOTLB if the Xen-SWIOTLB has been enabled. Note: The Xen-SWIOTLB is only built when CONFIG_XEN is enabled. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Cc: FUJITA Tomonori Cc: Albert Herranz Cc: Ian Campbell --- arch/x86/include/asm/xen/swiotlb-xen.h | 14 ++++++++ arch/x86/xen/Makefile | 1 + arch/x86/xen/pci-swiotlb-xen.c | 58 ++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) create mode 100644 arch/x86/include/asm/xen/swiotlb-xen.h create mode 100644 arch/x86/xen/pci-swiotlb-xen.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h new file mode 100644 index 000000000000..1be1ab7d6a41 --- /dev/null +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_SWIOTLB_XEN_H +#define _ASM_X86_SWIOTLB_XEN_H + +#ifdef CONFIG_SWIOTLB_XEN +extern int xen_swiotlb; +extern int __init pci_xen_swiotlb_detect(void); +extern void __init pci_xen_swiotlb_init(void); +#else +#define xen_swiotlb (0) +static inline int __init pci_xen_swiotlb_detect(void) { return 0; } +static inline void __init pci_xen_swiotlb_init(void) { } +#endif + +#endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3bb4fc21f4f2..32af238055c3 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o +obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c new file mode 100644 index 000000000000..a013ec9d0c54 --- /dev/null +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -0,0 +1,58 @@ +/* Glue code to lib/swiotlb-xen.c */ + +#include +#include + +#include +#include + +int xen_swiotlb __read_mostly; + +static struct dma_map_ops xen_swiotlb_dma_ops = { + .mapping_error = xen_swiotlb_dma_mapping_error, + .alloc_coherent = xen_swiotlb_alloc_coherent, + .free_coherent = xen_swiotlb_free_coherent, + .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, + .sync_single_for_device = xen_swiotlb_sync_single_for_device, + .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = xen_swiotlb_sync_sg_for_device, + .map_sg = xen_swiotlb_map_sg_attrs, + .unmap_sg = xen_swiotlb_unmap_sg_attrs, + .map_page = xen_swiotlb_map_page, + .unmap_page = xen_swiotlb_unmap_page, + .dma_supported = xen_swiotlb_dma_supported, +}; + +/* + * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use xen_swiotlb (by the boot + * option). + */ +int __init pci_xen_swiotlb_detect(void) +{ + + /* If running as PV guest, either iommu=soft, or swiotlb=force will + * activate this IOMMU. If running as PV privileged, activate it + * irregardlesss. + */ + if ((xen_initial_domain() || swiotlb || swiotlb_force) && + (xen_pv_domain())) + xen_swiotlb = 1; + + /* If we are running under Xen, we MUST disable the native SWIOTLB. + * Don't worry about swiotlb_force flag activating the native, as + * the 'swiotlb' flag is the only one turning it on. 
*/ + if (xen_pv_domain()) + swiotlb = 0; + + return xen_swiotlb; +} + +void __init pci_xen_swiotlb_init(void) +{ + if (xen_swiotlb) { + xen_swiotlb_init(1); + dma_ops = &xen_swiotlb_dma_ops; + } +} -- cgit From 113fc5a6e8c2288619ff7e8187a6f556b7e0d372 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 17:01:49 -0700 Subject: x86: Add memory modify constraints to xchg() and cmpxchg() xchg() and cmpxchg() modify their memory operands, not merely read them. For some versions of gcc the "memory" clobber has apparently dealt with the situation, but not for all. Originally-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Glauber Costa Cc: Avi Kivity Cc: Peter Palfrader Cc: Greg KH Cc: Alan Cox Cc: Zachary Amsden Cc: Marcelo Tosatti Cc: LKML-Reference: <4C4F7277.8050306@zytor.com> --- arch/x86/include/asm/cmpxchg_32.h | 68 +++++++++++++++++++-------------------- arch/x86/include/asm/cmpxchg_64.h | 40 +++++++++++------------ 2 files changed, 54 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 8859e12dd3cf..c1cf59d72f09 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -27,20 +27,20 @@ struct __xchg_dummy { switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -70,14 +70,14 @@ static inline void __set_64bit(unsigned long long *ptr, unsigned int low, unsigned int high) { asm volatile("\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%0)\n\t" + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + LOCK_PREFIX "cmpxchg8b (%1)\n\t" "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) + : "=m" (*ptr) + : "D" (ptr), + "b" (low), + "c" (high) : "ax", "dx", "memory"); } @@ -121,21 +121,21 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ @@ -180,12 +180,12 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile(LOCK_PREFIX "cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned 
long)(new >> 32)), + "0" (old) : "memory"); return prev; } @@ -195,12 +195,12 @@ static inline unsigned long long __cmpxchg64_local(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile("cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile("cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned long)(new >> 32)), + "0" (old) : "memory"); return prev; } diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 485ae415faec..b92f147339f3 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -26,26 +26,26 @@ extern void __cmpxchg_wrong_size(void); switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %k0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 8: \ asm volatile("xchgq %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -71,27 +71,27 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %k1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %k2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 8: \ - asm volatile(lock "cmpxchgq %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ -- cgit From c7f52cdc2f3e1733d3864e439ac2e92edd99ef31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 22 Jul 2010 22:58:01 -0700 Subject: support multiple .discard.* sections to avoid section type conflicts gcc 4.4.4 will complain if you use a .discard section for both text and data ("causes a section type conflict"). Add support for ".discard.*" sections, and use .discard.text for a dummy function in the x86 RESERVE_BRK() macro. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 86b1506f4179..ef292c792d74 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -82,7 +82,7 @@ void *extend_brk(size_t size, size_t align); * executable.) 
*/ #define RESERVE_BRK(name,sz) \ - static void __section(.discard) __used \ + static void __section(.discard.text) __used \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit From 69309a05907546fb686b251d4ab041c26afe1e1d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:29:52 -0700 Subject: x86, asm: Clean up and simplify set_64bit() Clean up and simplify set_64bit(). This code is quite old (1.3.11) and contains a fair bit of auxiliary machinery that current versions of gcc handle just fine automatically. Worse, the auxiliary machinery can actually cause an unnecessary spill to memory. Furthermore, the loading of the old value inside the loop in the 32-bit case is unnecessary: if the value doesn't match, the CMPXCHG8B instruction will already have loaded the "new previous" value for us. Clean up the comment, too, and remove page references to obsolete versions of the Intel SDM. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 67 ++++++++++++--------------------------- arch/x86/include/asm/cmpxchg_64.h | 4 +-- 2 files changed, 21 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index c1cf59d72f09..20955ea7bc12 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -53,60 +53,33 @@ struct __xchg_dummy { __xchg((v), (ptr), sizeof(*ptr)) /* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) + * CMPXCHG8B only writes to the target if we had the previous + * value in registers, otherwise it acts as a read and gives us the + * "new previous" value. That is why there is a loop. Preloading + * EDX:EAX is a performance optimization: in the common case it means + * we need only one locked operation. * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very + * least an FPU save and/or %cr0.ts manipulation. + * + * cmpxchg8b must be used with the lock prefix here to allow the + * instruction to be executed atomically. We need to have the reader + * side to see the coherent 64bit value.
*/ -static inline void __set_64bit(unsigned long long *ptr, - unsigned int low, unsigned int high) +static inline void set_64bit(volatile u64 *ptr, u64 value) { + u32 low = value; + u32 high = value >> 32; + u64 prev = *ptr; + asm volatile("\n1:\t" - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%1)\n\t" + LOCK_PREFIX "cmpxchg8b %0\n\t" "jnz 1b" - : "=m" (*ptr) - : "D" (ptr), - "b" (low), - "c" (high) - : "ax", "dx", "memory"); -} - -static inline void __set_64bit_constant(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); -} - -#define ll_low(x) *(((unsigned int *)&(x)) + 0) -#define ll_high(x) *(((unsigned int *)&(x)) + 1) - -static inline void __set_64bit_var(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, ll_low(value), ll_high(value)); + : "=m" (*ptr), "+A" (prev) + : "b" (low), "c" (high) + : "memory"); } -#define set_64bit(ptr, value) \ - (__builtin_constant_p((value)) \ - ? __set_64bit_constant((ptr), (value)) \ - : __set_64bit_var((ptr), (value))) - -#define _set_64bit(ptr, value) \ - (__builtin_constant_p(value) \ - ? __set_64bit(ptr, (unsigned int)(value), \ - (unsigned int)((value) >> 32)) \ - : __set_64bit(ptr, ll_low((value)), ll_high((value)))) - extern void __cmpxchg_wrong_size(void); /* diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index b92f147339f3..9596e7c61960 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -5,13 +5,11 @@ #define __xg(x) ((volatile long *)(x)) -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -#define _set_64bit set_64bit - extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); -- cgit From 18642a57df02a044b91219d3176128996ddc81a5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Jul 2010 23:52:29 -0700 Subject: x86, vdso: Don't quote $nm in the script for checking vdso references Don't quote $nm in the script for checking the vdso for external references. Doing so breaks multiword constructs, like using CROSS_COMPILE='ccache '. Reported-by: Stephen Rothwell Signed-off-by: H. Peter Anvin LKML-Reference: <20100728134252.2e4c27cf.sfr@canb.auug.org.au> --- arch/x86/vdso/checkundef.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh index 490be1c38f94..7ee90a9b549d 100755 --- a/arch/x86/vdso/checkundef.sh +++ b/arch/x86/vdso/checkundef.sh @@ -1,7 +1,7 @@ #!/bin/sh nm="$1" file="$2" -"$nm" "$file" | grep '^ *U' > /dev/null 2>&1 +$nm "$file" | grep '^ *U' > /dev/null 2>&1 if [ $? -eq 1 ]; then exit 0 else -- cgit From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:25 -0500 Subject: fanotify: fanotify_init syscall declaration This patch defines a new syscall fanotify_init() of the form: int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags, unsigned int priority) This syscall is used to create and fanotify group. This is very similar to the inotify_init() syscall. 
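For illustration only -- the following sketch is not part of the patch. Assuming the syscall table entries added below are wired up and the fanotify core from the rest of the series is built in, userspace would reach the new entry point directly through syscall(2), using the three-argument prototype quoted above (the zero flag values here are placeholders, not defined fanotify flags):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_fanotify_init
#define __NR_fanotify_init 300	/* x86-64 number assigned by this patch */
#endif

int main(void)
{
	/* flags, event_f_flags, priority -- all zero just to exercise the entry */
	long fd = syscall(__NR_fanotify_init, 0, 0, 0);

	if (fd < 0)
		perror("fanotify_init");
	return 0;
}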
Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa3..586cb3be2e32 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -842,4 +842,5 @@ ia32_sys_call_table: .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open .quad compat_sys_recvmmsg + .quad sys_fanotify_init ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index beb9b5f8f8a4..981c7e7ad804 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -343,10 +343,11 @@ #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 +#define __NR_fanotify_init 338 #ifdef __KERNEL__ -#define NR_syscalls 338 +#define NR_syscalls 339 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index ff4307b0e81e..4f23e04bdb34 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) __SYSCALL(__NR_perf_event_open, sys_perf_event_open) #define __NR_recvmmsg 299 __SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_fanotify_init 300 +__SYSCALL(__NR_fanotify_init, sys_fanotify_init) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..e38793b50e1d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init -- cgit From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:26 -0500 Subject: fanotify: sys_fanotify_mark declaration This patch simply declares the new sys_fanotify_mark syscall: int fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname) Signed-off-by: Eric Paris --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/ia32/sys_ia32.c | 9 +++++++++ arch/x86/include/asm/sys_ia32.h | 3 +++ arch/x86/include/asm/unistd_32.h | 3 ++- arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 6 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 586cb3be2e32..17cf65c94804 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -843,4 +843,5 @@ ia32_sys_call_table: .quad sys_perf_event_open .quad compat_sys_recvmmsg .quad sys_fanotify_init + .quad sys32_fanotify_mark ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 626be156d88d..3d093311d5e2 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -546,3 +546,12 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +asmlinkage long sys32_fanotify_mark(int fanotify_fd, unsigned int flags, + u32 mask_lo, u32 mask_hi, + int fd, const char __user *pathname) +{ + return sys_fanotify_mark(fanotify_fd, flags, + ((u64)mask_hi
<< 32) | mask_lo, + fd, pathname); +} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3ad421784ae7..cf4e2e381cba 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -80,4 +80,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *); /* ia32/ipc32.c */ asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); + +asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, + const char __user *); #endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 981c7e7ad804..80b799cd74f7 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -344,10 +344,11 @@ #define __NR_perf_event_open 336 #define __NR_recvmmsg 337 #define __NR_fanotify_init 338 +#define __NR_fanotify_mark 339 #ifdef __KERNEL__ -#define NR_syscalls 339 +#define NR_syscalls 340 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4f23e04bdb34..5b7b1d585616 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -665,6 +665,8 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open) __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __NR_fanotify_init 300 __SYSCALL(__NR_fanotify_init, sys_fanotify_init) +#define __NR_fanotify_mark 301 +__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e38793b50e1d..07ad5eb7cc5c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -338,3 +338,4 @@ ENTRY(sys_call_table) .long sys_perf_event_open .long sys_recvmmsg .long sys_fanotify_init + .long sys_fanotify_mark -- cgit From d78d671db478eb8b14c78501c0cee1cc7baf6967 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:30 +0200 Subject: x86, cpu: AMD errata checking framework Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that have an OSVW id assigned, which it takes as first argument. Both take a variable number of family-specific model-stepping ranges created by AMD_MODEL_RANGE(). Iff an erratum has an OSVW id, OSVW is available on the CPU, and the OSVW id is known to the hardware, it is used to determine whether an erratum is present. Otherwise, the model-stepping ranges are matched against the current CPU to find out whether the erratum applies. For certain special errata, the code using this framework might have to conduct further checks to make sure an erratum is really (not) present. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor.h | 18 ++++++++++++ arch/x86/kernel/cpu/amd.c | 60 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7e5c6a60b8ee..5084c2f5ac20 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1025,4 +1025,22 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, return ratio; } +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) 
{ -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12b9cff047c1..80665410b064 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -609,3 +609,63 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} -- cgit From 9d8888c2a214aece2494a49e699a097c2ba9498b Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:31 +0200 Subject: x86, cpu: Clean up AMD erratum 400 workaround Remove check_c1e_idle() and use the new AMD errata checking framework instead. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-2-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 5 +++++ arch/x86/kernel/process.c | 39 ++------------------------------------- 3 files changed, 8 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5084c2f5ac20..eebdc1fde3d1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); #define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80665410b064..a62a4ae7a11a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,6 +628,11 @@ cpu_dev_register(amd_cpu_dev); * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); */ +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + + bool cpu_has_amd_erratum(const int *erratum) { struct cpuinfo_x86 *cpu = ¤t_cpu_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..553b02f13094 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} - static cpumask_var_t c1e_mask; static int c1e_detected; @@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else -- cgit From 1be85a6d93f4207d8c2c6238c4a96895e28cefba Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:32 +0200 Subject: x86, cpu: Use AMD errata checking framework for erratum 383 Use the AMD errata checking framework instead of open-coding the test. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-3-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/amd.c | 2 ++ arch/x86/kvm/svm.c | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eebdc1fde3d1..d85637bb9505 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1029,6 +1029,7 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, * AMD errata checking */ #ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_383[]; extern const int amd_erratum_400[]; extern bool cpu_has_amd_erratum(const int *); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a62a4ae7a11a..30f30dcbdb82 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,6 +632,8 @@ const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); bool cpu_has_amd_erratum(const int *erratum) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..03b534b34ee9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -383,8 +383,7 @@ static void svm_init_erratum_383(void) int err; u64 val; - /* Only Fam10h is affected */ - if (boot_cpu_data.x86 != 0x10) + if (!cpu_has_amd_erratum(amd_erratum_383)) return; /* Use _safe variants to not break nested virtualization */ -- cgit From 4532b305e8f0c238dd73048068ff8a6dd1380291 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 15:18:35 -0700 Subject: x86, asm: Clean up and simplify Remove the __xg() hack to create a memory barrier near xchg and cmpxchg; it has been there since 1.3.11 but should not be necessary with "asm volatile" and a "memory" clobber, neither of which were there in the original implementation. However, we *should* make this a volatile reference. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 75 ++++++++++++++++++++++----------------- arch/x86/include/asm/cmpxchg_64.h | 61 +++++++++++++++++++++---------- 2 files changed, 84 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 20955ea7bc12..f5bd1fd388ff 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -11,38 +11,42 @@ extern void __xchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. 
*/ - -struct __xchg_dummy { - unsigned long a[100]; -}; -#define __xg(x) ((struct __xchg_dummy *)(x)) - #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -94,23 +98,32 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ @@ -148,31 +161,27 @@ extern void __cmpxchg_wrong_size(void); (unsigned long long)(n))) #endif -static inline unsigned long long __cmpxchg64(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile(LOCK_PREFIX "cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; } -static inline unsigned long long __cmpxchg64_local(volatile void *ptr, - unsigned long long old, - unsigned long long new) +static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - unsigned long long prev; + u64 prev; asm volatile("cmpxchg8b %1" : "=A" (prev), - "+m" (*__xg(ptr)) - : "b" ((unsigned long)new), - "c" ((unsigned long)(new >> 32)), + "+m" (*ptr) + : "b" ((u32)new), + "c" ((u32)(new >> 32)), "0" (old) : "memory"); return prev; diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 9596e7c61960..423ae58aa020 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -3,8 +3,6 @@ #include /* Provides LOCK_PREFIX */ -#define __xg(x) ((volatile long *)(x)) - static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; @@ -14,38 +12,51 @@ extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); /* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. 
--ANK + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. */ #define __xchg(x, ptr, size) \ ({ \ __typeof(*(ptr)) __x = (x); \ switch (size) { \ case 1: \ - asm volatile("xchgb %b0,%1" \ - : "=q" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile("xchgw %w0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile("xchgl %k0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__xg(ptr)) \ + : "=r" (__x), "+m" (*__ptr) \ : "0" (__x) \ : "memory"); \ break; \ + } \ default: \ __xchg_wrong_size(); \ } \ @@ -69,29 +80,41 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "q" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 2: \ - asm volatile(lock "cmpxchgw %w2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 4: \ - asm volatile(lock "cmpxchgl %k2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ case 8: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ asm volatile(lock "cmpxchgq %2,%1" \ - : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "=a" (__ret), "+m" (*__ptr) \ : "r" (__new), "0" (__old) \ : "memory"); \ break; \ + } \ default: \ __cmpxchg_wrong_size(); \ } \ -- cgit From a5b91606bdc9d0a0d036d2d829a22921c705573e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:23:20 -0700 Subject: x86, cpu: Export AMD errata definitions Exprot the AMD errata definitions, since they are needed by kvm_amd.ko if that is built as a module. Doing "make allmodconfig" during testing would have caught this. Signed-off-by: H. 
Peter Anvin Cc: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> --- arch/x86/kernel/cpu/amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 30f30dcbdb82..60a57b13082d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,9 +631,11 @@ cpu_dev_register(amd_cpu_dev); const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { @@ -676,3 +678,5 @@ bool cpu_has_amd_erratum(const int *erratum) return false; } + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); -- cgit From 90c8f92f5c807807ca74d5f2f313794925174e6b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:53:49 -0700 Subject: x86, asm: Move cmpxchg emulation code to arch/x86/lib Move cmpxchg emulation code from arch/x86/kernel/cpu (which is otherwise CPU identification) to arch/x86/lib, where other emulation code lives already. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cmpxchg.c | 72 ------------------------------------------- arch/x86/lib/Makefile | 1 + arch/x86/lib/cmpxchg.c | 72 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 73 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cmpxchg.c create mode 100644 arch/x86/lib/cmpxchg.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6f..c47c43914ba7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,7 +16,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572cc..000000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include -#include -#include - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. 
Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f871e04b6965..e10cf070ede0 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -30,6 +30,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o lib-y += semaphore_32.o string_32.o + lib-y += cmpxchg.o ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += cmpxchg8b_emu.o atomic64_386_32.o endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c new file mode 100644 index 000000000000..2056ccf572cc --- /dev/null +++ b/arch/x86/lib/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + -- cgit From a378d9338e8dde78314b3a6ae003de351936c729 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 17:05:11 -0700 Subject: x86, asm: Merge cmpxchg_486_u64() and cmpxchg8b_emu() We have two functions for doing exactly the same thing -- emulating cmpxchg8b on 486 and older hardware -- with different calling conventions, and yet doing the same thing. Drop the C version and use the assembly version, via alternatives, for both the local and non-local versions of cmpxchg8b. Signed-off-by: H. 
Peter Anvin LKML-Reference: --- arch/x86/include/asm/cmpxchg_32.h | 30 ++++++++++++++---------------- arch/x86/lib/cmpxchg.c | 18 ------------------ 2 files changed, 14 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f5bd1fd388ff..284a6e8f7ce1 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -246,8 +246,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); - #define cmpxchg64(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ @@ -265,20 +263,20 @@ extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); __ret; }) - -#define cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64_local(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) #endif diff --git a/arch/x86/lib/cmpxchg.c b/arch/x86/lib/cmpxchg.c index 2056ccf572cc..5d619f6df3ee 100644 --- a/arch/x86/lib/cmpxchg.c +++ b/arch/x86/lib/cmpxchg.c @@ -52,21 +52,3 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) } EXPORT_SYMBOL(cmpxchg_386_u32); #endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -- cgit From 3709c857350976408953831f0cf89d19951394a1 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Thu, 22 Jul 2010 14:57:35 +0900 Subject: x86: Ioremap: fix wrong physical address handling in PAT code The following two commits fixed a problem that x86 ioremap() doesn't handle physical address higher than 32-bit properly in X86_32 PAE mode. ffa71f33a820d1ab3f2fc5723819ac60fb76080b (x86, ioremap: Fix incorrect physical address handling in PAE mode) 35be1b716a475717611b2dc04185e9d80b9cb693 (x86, ioremap: Fix normal ram range check) But these fixes are not enough, since pat_pagerange_is_ram() in PAT code also has a same problem. This patch fixes it. 
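A clarifying aside, not part of the patch: the underlying failure is plain integer truncation. On a 32-bit PAE kernel, phys_addr_t and resource_size_t are 64 bits wide while unsigned long is only 32, so the old pat_pagerange_is_ram() prototype silently dropped the upper address bits. A minimal user-space sketch of the same effect, with uint64_t standing in for phys_addr_t and assuming a build where unsigned long is 32 bits:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t phys = 0x100000000ULL;		/* 4 GiB: reachable under PAE, above 32 bits */
	unsigned long clipped = (unsigned long)phys;	/* becomes 0 on a 32-bit build */

	printf("full=0x%llx clipped=0x%lx\n",
	       (unsigned long long)phys, clipped);
	return 0;
}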
Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Kenji Kaneshige LKML-Reference: <4C47DDCF.80300@jp.fujitsu.com> Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index acc15b23b743..03b48c80c651 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -158,7 +158,7 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ram_page = 0, not_rampage = 0; unsigned long page_nr; -- cgit From ca65f9fc0c447da5b270b05c41c21b19c88617c3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Jul 2010 14:37:48 +0100 Subject: Introduce CONFIG_XEN_PVHVM compile option This patch introduce a CONFIG_XEN_PVHVM compile time option to enable/disable Xen PV on HVM support. Signed-off-by: Stefano Stabellini --- arch/x86/kernel/cpu/hypervisor.c | 2 +- arch/x86/xen/Kconfig | 5 +++++ arch/x86/xen/enlighten.c | 2 ++ arch/x86/xen/mmu.c | 2 ++ arch/x86/xen/platform-pci-unplug.c | 2 ++ arch/x86/xen/time.c | 3 ++- 6 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 5bccedcb9121..8095f8611f8a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif }; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index b83e119fbeb0..68128a1b401a 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,11 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. 
+config XEN_PVHVM + def_bool y + depends on XEN + depends on X86_LOCAL_APIC + config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" default 8 if X86_32 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75b479a684fd..6f5345378abc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1282,6 +1282,7 @@ void xen_hvm_init_shared_info(void) } } +#ifdef CONFIG_XEN_PVHVM static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1338,3 +1339,4 @@ const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { .init_platform = xen_hvm_guest_init, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); +#endif diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 84648c1bf138..413b19b3d0fe 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1942,6 +1942,7 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_exit_mmap(struct mm_struct *mm) { struct xen_hvm_pagetable_dying a; @@ -1973,6 +1974,7 @@ void __init xen_hvm_init_mmu_ops(void) if (is_pagetable_dying_supported()) pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; } +#endif #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 2f7f3fb34777..554c002a1e1a 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -32,6 +32,7 @@ /* store the value of xen_emul_unplug after the unplug is done */ int xen_platform_pci_unplug; EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +#ifdef CONFIG_XEN_PVHVM static int xen_emul_unplug; static int __init check_platform_magic(void) @@ -133,3 +134,4 @@ static int __init parse_xen_emul_unplug(char *arg) return 0; } early_param("xen_emul_unplug", parse_xen_emul_unplug); +#endif diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 4780e55886a5..2aab4a2b9100 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -516,6 +516,7 @@ __init void xen_init_time_ops(void) x86_platform.set_wallclock = xen_set_wallclock; } +#ifdef CONFIG_XEN_PVHVM static void xen_hvm_setup_cpu_clockevents(void) { int cpu = smp_processor_id(); @@ -544,4 +545,4 @@ __init void xen_hvm_init_time_ops(void) x86_platform.get_wallclock = xen_get_wallclock; x86_platform.set_wallclock = xen_set_wallclock; } - +#endif -- cgit From 73cd3b43f08cc9a9bcb168994b8e9ebd983ff573 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 15 Jun 2010 15:43:19 +0200 Subject: x86/PCI: pci, fix section mismatch pcibios_scan_specific_bus calls pci_scan_bus_on_node which is __devinit. Mark pcibios_scan_specific_bus __devinit as well since all users are now __init or __devinit. Signed-off-by: Jiri Slaby Signed-off-by: Jesse Barnes --- arch/x86/pci/legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 8d460eaf524f..c89266be6048 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -36,7 +36,7 @@ int __init pci_legacy_init(void) return 0; } -void pcibios_scan_specific_bus(int busn) +void __devinit pcibios_scan_specific_bus(int busn) { int devfn; long node; -- cgit From 7bd1c365fd124624191d49dcc1eb9759d6017ec3 Mon Sep 17 00:00:00 2001 From: Mike Habeck Date: Wed, 12 May 2010 11:14:32 -0700 Subject: x86/PCI: Add option to not assign BAR's if not already assigned The Linux kernel assigns BARs that a BIOS did not assign, most likely to handle broken BIOSes that didn't enumerate the devices correctly. 
On UV the BIOS purposely doesn't assign I/O BARs for certain devices/ drivers we know don't use them (examples, LSI SAS, Qlogic FC, ...). We purposely don't assign these I/O BARs because I/O Space is a very limited resource. There is only 64k of I/O Space, and in a PCIe topology that space gets divided up into 4k chucks (this is due to the fact that a pci-to-pci bridge's I/O decoder is aligned at 4k)... Thus a system can have at most 16 cards with I/O BARs: (64k / 4k = 16) SGI needs to scale to >16 devices with I/O BARs. So by not assigning I/O BARs on devices we know don't use them, we can do that (iff the kernel doesn't go and assign these BARs that the BIOS purposely didn't assign). This patch will not assign a resource to a device BAR if that BAR was not assigned by the BIOS, and the kernel cmdline option 'pci=nobar' was specified. This patch is closely modeled after the 'pci=norom' option that currently exists in the tree. Signed-off-by: Mike Habeck Signed-off-by: Mike Travis Signed-off-by: Jesse Barnes --- arch/x86/include/asm/pci_x86.h | 1 + arch/x86/pci/common.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index cd2a31dc5fb8..49c7219826f9 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -30,6 +30,7 @@ #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 #define PCI_ROOT_NO_CRS 0x100000 +#define PCI_NOASSIGN_BARS 0x200000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 215a27ae050d..a0772af64efb 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -125,6 +125,23 @@ void __init dmi_check_skip_isa_align(void) static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) { struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + struct resource *bar_r; + int bar; + + if (pci_probe & PCI_NOASSIGN_BARS) { + /* + * If the BIOS did not assign the BAR, zero out the + * resource so the kernel doesn't attmept to assign + * it later on in pci_assign_unassigned_resources + */ + for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + bar_r = &dev->resource[bar]; + if (bar_r->start == 0 && bar_r->end != 0) { + bar_r->flags = 0; + bar_r->end = 0; + } + } + } if (pci_probe & PCI_NOASSIGN_ROMS) { if (rom_r->parent) @@ -509,6 +526,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "norom")) { pci_probe |= PCI_NOASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "nobar")) { + pci_probe |= PCI_NOASSIGN_BARS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; -- cgit From 2491762cfb475dbdfa3db11ebea6de49f58b7fac Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 23 Jul 2010 12:53:27 -0600 Subject: x86/PCI: use host bridge _CRS info on ASRock ALiveSATA2-GLAN This DMI quirk turns on "pci=use_crs" for the ALiveSATA2-GLAN because amd_bus.c doesn't handle this system correctly. The system has a single HyperTransport I/O chain, but has two PCI host bridges to buses 00 and 80. amd_bus.c learns the MMIO range associated with buses 00-ff and that this range is routed to the HT chain hosted at node 0, link 0: bus: [00, ff] on node 0 link 0 bus: 00 index 1 [mem 0x80000000-0xfcffffffff] This includes the address space for both bus 00 and bus 80, and amd_bus.c assumes it's all routed to bus 00. 
We find device 80:01.0, which the BIOS left in the middle of that space, but we don't find a bridge from bus 00 to bus 80, so we conclude that 80:01.0 is unreachable from bus 00, and we move it from the original, working, address to something outside the bus 00 aperture, which does not work: pci 0000:80:01.0: reg 10: [mem 0xfebfc000-0xfebfffff 64bit] pci 0000:80:01.0: BAR 0: assigned [mem 0xfd00000000-0xfd00003fff 64bit] The BIOS told us everything we need to know to handle this correctly, so we're better off if we just pay attention, which lets us leave the 80:01.0 device at the original, working, address: ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-7f]) pci_root PNP0A03:00: host bridge window [mem 0x80000000-0xff37ffff] ACPI: PCI Root Bridge [PCI1] (domain 0000 [bus 80-ff]) pci_root PNP0A08:00: host bridge window [mem 0xfebfc000-0xfebfffff] This was a regression between 2.6.33 and 2.6.34. In 2.6.33, amd_bus.c was used only when we found multiple HT chains. 3e3da00c01d050, which enabled amd_bus.c even on systems with a single HT chain, caused this failure. This quirk was written by Graham. If we ever enable "pci=use_crs" for machines from 2006 or earlier, this quirk should be removed. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=16007 Cc: stable@kernel.org Reported-by: Graham Ramsey Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2ec04c424a62..15466c096ba5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASRock ALiveSATA2-GLAN", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), + }, + }, {} }; -- cgit From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 23 Jul 2010 14:56:28 +0100 Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove unsafe and unnecessary hardware access" changed read_msi_msg_desc() to return the last MSI message written instead of reading it from the device, since it may be called while the device is in a reduced power state. However, the pSeries platform code really does need to read messages from the device, since they are initially written by firmware.
Therefore: - Restore the previous behaviour of read_msi_msg_desc() - Add new functions get_cached_msi_msg{,_desc}() which return the last MSI message written - Use the new functions where appropriate Acked-by: Michael Ellerman Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26d..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); -- cgit From 1f7979ac53224b0208e7d3eaeb5fd72ab9687389 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:04:03 +0400 Subject: x86/PCI: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 9810a0f76c91..f547ee05f715 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -989,7 +989,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin - 1, irq); /* Update IRQ for all devices with the same pirq value */ - while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { + for_each_pci_dev(dev2) { pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; @@ -1028,7 +1028,7 @@ void __init pcibios_fixup_irqs(void) u8 pin; DBG(KERN_DEBUG "PCI: IRQ fixup\n"); - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* * If the BIOS has set an out of range IRQ number, just * ignore it. Also keep track of which IRQ's are @@ -1052,7 +1052,7 @@ void __init pcibios_fixup_irqs(void) return; dev = NULL; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) continue; -- cgit From 68f202e4e87cfab4439568bf397fcc5c7cf8d729 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Jul 2010 11:46:42 -0700 Subject: x86, mtrr: Use stop machine context to rendezvous all the cpu's Use the stop machine context rather than IPI's to rendezvous all the cpus for MTRR initialization that happens during cpu bringup or for MTRR modifications during runtime. This avoids deadlock scenario (reported by Prarit) like: cpu A holds a read_lock (tasklist_lock for example) with irqs enabled cpu B waits for the same lock with irqs disabled using write_lock_irq cpu C doing set_mtrr() (during AP bringup for example), which will try to rendezvous all the cpus using IPI's This will result in C and A come to the rendezvous point and waiting for B. B is stuck forever waiting for the lock and thus not reaching the rendezvous point. Using stop cpu (run in the process context of per cpu based keventd) to do this rendezvous, avoids this deadlock scenario. Also make sure all the cpu's are in the rendezvous handler before we proceed with the local_irq_save() on each cpu. 
This lock step disabling irqs on all the cpus will avoid other deadlock scenarios (for example involving with the blocking smp_call_function's etc). [ This problem is very old. Marking -stable only for 2.6.35 as the stop_one_cpu_nowait() API is present only in 2.6.35. Any older kernel interested in this fix need to do some more work in backporting this patch. ] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha LKML-Reference: <1280515602.2682.10.camel@sbsiddha-MOBL3.sc.intel.com> Acked-by: Prarit Bhargava Cc: stable@kernel.org [2.6.35] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 56 +++++++++++++++++++++++++++++++---------- arch/x86/kernel/smpboot.c | 7 ++++++ 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include /* FIXME: kvm_para.h needs this */ +#include #include #include #include @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. 
* When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..11015fd1abbc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) -- cgit From 9792db6174d9927700ed288e6d74b9391bf785d1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:42 -0700 Subject: x86, cpu: Package Level Thermal Control, Power Limit Notification definitions Add package level thermal and power limit feature support. The two MSRs and features are new starting with Intel's Sandy Bridge processor. Please check Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification and 14.6 Package Level Thermal Management. This patch also fixes a bug which defines reverse THERM_INT_LOW_ENABLE bit and THERM_INT_HIGH_ENABLE bit. [ hpa: fixed up against current tip:x86/cpu ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-2-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/msr-index.h | 17 +++++++++++++++-- arch/x86/kernel/cpu/scattered.c | 2 ++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 4be50ddd4d79..817aa316b180 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -166,6 +166,8 @@ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ +#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7cc4a026331c..4ea2a7ca7a4b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -224,12 +224,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b -#define THERM_INT_LOW_ENABLE (1 << 0) -#define THERM_INT_HIGH_ENABLE (1 << 1) +#define THERM_INT_HIGH_ENABLE (1 << 0) +#define THERM_INT_LOW_ENABLE (1 << 1) +#define THERM_INT_PLN_ENABLE (1 << 24) #define MSR_IA32_THERM_STATUS 0x0000019c #define THERM_STATUS_PROCHOT (1 << 0) +#define THERM_STATUS_POWER_LIMIT (1 << 10) #define MSR_THERM2_CTL 0x0000019d @@ -241,6 +243,17 @@ #define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 + +#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 + +#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) +#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 9815364b477e..34b4dad6f0b8 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -33,6 +33,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, -- cgit From 25971865d48a8d0ece5307a59dbd3f06d05a7567 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 16 Jun 2010 23:19:28 -0400 Subject: x86, olpc: Use pr_debug() for EC commands Unconditionally printing EC debug messages was helpful when we were actually debugging the EC, but during normal operation it can get pretty annoying. Using pr_debug allows us finer-grained control. Signed-off-by: Andres Salomon LKML-Reference: <20100616231928.16b539f0@dev.queued.net> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/olpc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 156605281f56..f5ff3903b38b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -142,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -159,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -173,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } -- cgit From 54e5bc020ce1c959eaa7be18cedb734b6b13745e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 28 Jun 2010 22:00:29 -0400 Subject: x86, olpc: Constify an olpc_ofw() arg The arguments passed to OFW shouldn't be modified; update the 'args' argument of olpc_ofw to reflect this. This saves us some later casting away of consts. Signed-off-by: Andres Salomon LKML-Reference: <20100628220029.1555ac24@debian> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc_ofw.h | 2 +- arch/x86/kernel/olpc.c | 2 +- arch/x86/kernel/olpc_ofw.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index 3e63d857c48f..08fde475cb3b 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -12,7 +12,7 @@ #define olpc_ofw(name, args, res) \ __olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) -extern int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +extern int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res); /* determine whether OFW is available and lives in the proper memory */ diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index f5ff3903b38b..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -188,7 +188,7 @@ static void __init platform_detect(void) { size_t propsize; __be32 rev; - void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; void *res[] = { &propsize }; if (olpc_ofw("getprop", args, res) || propsize != 4) { diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index f5d499fbe74f..3218aa71ab5e 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -40,7 +40,7 @@ void __init setup_olpc_ofw_pgd(void) early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res) { int ofw_args[MAXARGS + 3]; -- cgit From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 14:28:19 +0800 Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages In common cases, guest SRAO MCE will cause corresponding poisoned page be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay the MCE to guest OS. 
But it is reported that if the poisoned page is accessed in guest after unmapping and before MCE is relayed to guest OS, userspace will be killed. The reason is as follows. Because poisoned page has been un-mapped, guest access will cause guest exit and kvm_mmu_page_fault will be called. kvm_mmu_page_fault can not get the poisoned page for fault address, so kernel and user space MMIO processing is tried in turn. In user MMIO processing, poisoned page is accessed again, then userspace is killed by force_sig_info. To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM and do not try kernel and user space MMIO processing for poisoned page. [xiao: fix warning introduced by avi] Reported-by: Max Asbock Signed-off-by: Huang Ying Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++-------- arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 28 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..b666d8d106a9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ + char buf[1]; + void __user *hva; + int r; + + /* Touch the page, so send SIGBUS */ + hva = (void __user *)gfn_to_hva(kvm, gfn); + r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(kvm, gfn); + return 0; + } + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b549..c7f27779c998 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -431,11 +431,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) { - pgprintk("gfn %lx is mmio\n", walker.gfn); - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) -- cgit From c332c83ae736c72dcf072e96e98a774fce39e722 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 12:24:12 +0300 Subject: KVM: VMX: Simplify vmx_get_nmi_mask() !! is not needed due to the cast to bool. 
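[ Illustration, not part of the patch: the simplification relies on C99
  bool conversion, which already normalizes any non-zero value to 1/true,
  so the explicit !! adds nothing once the return type is bool. A
  stand-alone userspace example of the same idiom: ]

#include <stdbool.h>
#include <stdio.h>

static bool has_flag(unsigned int word, unsigned int mask)
{
	return word & mask;	/* conversion to bool yields 0 or 1 */
}

int main(void)
{
	printf("%d\n", has_flag(0xf0, 0x40));	/* prints 1, not 64 */
	return 0;
}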
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..64252075796a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2826,9 +2826,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; - else - return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - GUEST_INTR_STATE_NMI); + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) -- cgit From 914ebccd2d8fa439e01fe93b5229534b9e179a69 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 28 Apr 2010 18:50:36 +0900 Subject: KVM: x86: avoid unnecessary bitmap allocation when memslot is clean Although we always allocate a new dirty bitmap in x86's get_dirty_log(), it is only used as a zero-source of copy_to_user() and freed right after that when memslot is clean. This patch uses clear_user() instead of doing this unnecessary zero-source allocation. Performance improvement: as we can expect easily, the time needed to allocate a bitmap is completely reduced. In my test, the improved ioctl was about 4 to 10 times faster than the original one for clean slots. Furthermore, reducing memory allocations and copies will produce good effects to caches too. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..1b270fd60634 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2797,7 +2797,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -2812,27 +2811,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; - memset(dirty_bitmap, 0, n); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. 
*/ if (is_dirty) { struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) { + vfree(dirty_bitmap); + goto out; + } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +2843,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { + vfree(dirty_bitmap); + goto out; + } + vfree(dirty_bitmap); + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; } r = 0; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - r = -EFAULT; -out_free: - vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; -- cgit From 08acfa187117046f8b5044b4a4cdc910f3ceeeb5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 4 May 2010 13:00:55 +0300 Subject: KVM: kvm_pdptr_read() may sleep Annotate it thusly. Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf5322..d2a98f8f9af5 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { + might_sleep(); /* on svm */ + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); -- cgit From 1c11e713576edf33b95669be9c2dc0ff1e0c90d3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:05:44 +0300 Subject: KVM: VMX: Avoid writing HOST_CR0 every entry cr0.ts may change between entries, so we copy cr0 to HOST_CR0 before each entry. That is slow, so instead, set HOST_CR0 to have TS set unconditionally (which is a safe value), and issue a clts() just before exiting vcpu context if the task indeed owns the fpu. Saves ~50 cycles/exit. 
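[ Condensed sketch of the idea, reusing the symbols from the hunks below;
  it is a simplification for readability, not a drop-in change: HOST_CR0
  is written once at VMCS setup time with TS forced on (a safe,
  conservative value), and the exit path pays for a clts() only when the
  current task really owns the FPU. ]

	/* once, at vmx_vcpu_setup() time */
	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);	/* 22.2.3 */

	/* on every return to host context, instead of rewriting HOST_CR0
	 * before each guest entry */
	if (current_thread_info()->status & TS_USEDFPU)
		clts();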
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 +++------ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64252075796a..598931734251 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -812,6 +812,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); } #endif + if (current_thread_info()->status & TS_USEDFPU) + clts(); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2507,7 +2509,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ @@ -3859,11 +3861,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b270fd60634..801afc6461ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1731,8 +1731,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_put_guest_fpu(vcpu); kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); } static int is_efer_nx(void) -- cgit From 9de41573675cbace09b02ef386f3e9c8739d495c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:22 +0300 Subject: KVM: x86 emulator: introduce read cache Introduce read cache which is needed for instruction that require more then one exit to userspace. After returning from userspace the instruction will be re-executed with cached read value. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0b2729bf2070..288cbedcab1c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -186,6 +186,7 @@ struct decode_cache { unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; + struct read_cache mem_read; }; struct x86_emulate_ctxt { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed6..776874b8e50e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1263,6 +1263,33 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; } +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) +{ + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; + + rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; + + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; + } + return X86EMUL_CONTINUE; +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -1504,9 +1531,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -2475,6 +2502,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int saved_dst_type = c->dst.type; ctxt->interruptibility = 0; + ctxt->decode.mem_read.pos = 0; /* Shadow copy of register state. Committed on successful emulation. * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't @@ -2529,20 +2557,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + &c->src.val, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } if (c->src2.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src2.ptr, - &c->src2.val, - c->src2.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2553,8 +2577,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, - c->dst.bytes, ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2981,7 +3005,11 @@ writeback: (rc->end != 0 && rc->end == rc->pos)) ctxt->restart = false; } - + /* + * reset read cache here in case string instruction is restared + * without decoding + */ + ctxt->decode.mem_read.end = 0; /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(ctxt->vcpu, c->eip); -- cgit From 054fe9f6e3b76877516b37ac7d83d58c7f37c1b6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:23 +0300 Subject: KVM: x86 emulator: fix Move r/m16 to segment register decoding This instruction does not need generic decoding for its dst operand. 
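[ Background illustration, not kernel code: for opcode 0x8e the ModRM
  "reg" field selects a segment register, not a general purpose register,
  so the generic DstReg decode would pick the wrong destination. For
  example "mov es, ax" encodes as 0x8e 0xc0: ]

#include <stdio.h>

int main(void)
{
	unsigned char modrm = 0xc0;		/* from "mov es, ax" = 8e c0 */
	unsigned int sreg = (modrm >> 3) & 7;	/* 0 -> ES, the destination  */
	unsigned int rm   = modrm & 7;		/* 0 -> AX, the r/m16 source */

	printf("dst sreg %u, src rm %u\n", sreg, rm);
	return 0;
}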
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 776874b8e50e..a81e6bfcade7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -171,7 +171,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, + ImplicitOps | SrcMem | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit From f0c13ef1a8f31be08bf1b1244fe4565f11f4b009 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:24 +0300 Subject: KVM: x86 emulator: cleanup xchg emulation Dst operand is already initialized during decoding stage. No need to reinitialize. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a81e6bfcade7..a99d49cc8934 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2804,8 +2804,8 @@ special_insn: break; } case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.type = OP_REG; + c->src.bytes = c->op_bytes; c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; c->src.val = *(c->src.ptr); goto xchg; -- cgit From b8a98945ea5b735e083eaf92906aa0ff9ece92e8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:25 +0300 Subject: KVM: x86 emulator: cleanup nop emulation Make it more explicit what we are checking for. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a99d49cc8934..03a72912d7b9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2799,8 +2799,8 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; + if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 0x97: /* xchg reg,rax */ -- cgit From 414e6277fd148f6470261cef50a7fed0d88a2825 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:26 +0300 Subject: KVM: x86 emulator: handle "far address" source operand ljmp/lcall instruction operand contains address and segment. It can be 10 bytes long. Currently we decode it as two different operands. Fix it by introducing new kind of operand that can hold entire far address. 
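[ Stand-alone illustration of the operand layout; the helper below is
  hypothetical, not part of the patch. A far address operand is an offset
  of op_bytes (2, 4 or 8) followed by a 16-bit selector, hence at most 10
  bytes, which is what the new valptr[] union member makes room for.
  Little-endian byte order is assumed, as on x86: ]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct far_ptr {
	uint64_t offset;
	uint16_t selector;
};

static struct far_ptr unpack_far(const unsigned char *buf, unsigned op_bytes)
{
	struct far_ptr fp = { 0, 0 };

	memcpy(&fp.offset, buf, op_bytes);	 /* first op_bytes: offset   */
	memcpy(&fp.selector, buf + op_bytes, 2); /* next two bytes: selector */
	return fp;
}

int main(void)
{
	/* "ljmp 0x0010:0x00401000" with 32-bit operand size: 4 + 2 bytes */
	unsigned char imm[6] = { 0x00, 0x10, 0x40, 0x00, 0x10, 0x00 };
	struct far_ptr fp = unpack_far(imm, 4);

	printf("%#x:%#llx\n", fp.selector, (unsigned long long)fp.offset);
	return 0;
}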
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 +++- arch/x86/kvm/emulate.c | 56 ++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 288cbedcab1c..69a64a6a36f4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -143,7 +143,11 @@ struct x86_emulate_ops { struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - unsigned long val, orig_val, *ptr; + unsigned long orig_val, *ptr; + union { + unsigned long val; + char valptr[sizeof(unsigned long) + 2]; + }; }; struct fetch_cache { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 03a72912d7b9..687ea0906b79 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -67,6 +67,8 @@ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -88,10 +90,6 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be - in memory and second argument is located - immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { @@ -175,7 +173,7 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16 | No64, 0, + 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -215,7 +213,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, + SrcImmFAddr | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ @@ -350,7 +348,7 @@ static u32 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, @@ -576,6 +574,13 @@ static u32 group2_table[] = { (_type)_x; \ }) +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -1160,6 +1165,17 @@ done_prefixes: c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; } /* @@ -1179,22 +1195,10 @@ done_prefixes: c->src2.bytes = 1; 
c->src2.val = insn_fetch(u8, 1, c->eip); break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; break; - case Src2Mem16: - c->src2.type = OP_MEM; - c->src2.bytes = 2; - c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); - c->src2.val = 0; - break; } /* Decode and fetch the destination operand: register or memory. */ @@ -2558,7 +2562,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->src.type == OP_MEM) { rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, - &c->src.val, c->src.bytes); + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; @@ -2884,14 +2888,18 @@ special_insn: } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ + case 0xea: { /* jmp far */ + unsigned short sel; jump_far: - if (load_segment_descriptor(ctxt, ops, c->src2.val, - VCPU_SREG_CS)) + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) goto done; - c->eip = c->src.val; + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); break; + } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); -- cgit From 35aa5375d407ecadcc3adb5cb31d27044bf7f29f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:27 +0300 Subject: KVM: x86 emulator: add (set|get)_dr callbacks to x86_emulate_ops Add (set|get)_dr callbacks to x86_emulate_ops instead of calling them directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/include/asm/kvm_host.h | 4 ---- arch/x86/kvm/emulate.c | 7 +++++-- arch/x86/kvm/x86.c | 12 ++++++------ 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 69a64a6a36f4..c37296d0e909 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -137,6 +137,8 @@ struct x86_emulate_ops { void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); + int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); }; /* Type, address-of, and value of an instruction's operand. 
*/ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76f5483cffec..97774ae3c874 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -591,10 +591,6 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest); -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 687ea0906b79..8a4aa73ff1e4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3132,7 +3132,7 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ @@ -3145,7 +3145,10 @@ twobyte_insn: kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); + + ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), + ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 801afc6461ed..059d63de169b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3620,16 +3620,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; } -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(ctxt->vcpu, dr, dest); + return kvm_get_dr(vcpu, dr, dest); } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - - return kvm_set_dr(ctxt->vcpu, dr, value & mask); + return kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3811,6 +3809,8 @@ static struct x86_emulate_ops emulate_ops = { .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .set_rflags = emulator_set_rflags, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) -- cgit From 3fb1b5dbd397d16a855c97c3fb80fe6e9196ce7c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:28 +0300 Subject: KVM: x86 emulator: add (set|get)_msr callbacks to x86_emulate_ops Add (set|get)_msr callbacks to x86_emulate_ops instead of calling them directly. 
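[ Side note, not part of the patch: with the callback in place the
  emulator builds the 64-bit MSR value itself and hands it to whatever
  ->set_msr the host wired up (kvm_set_msr here). The value is simply
  EDX:EAX, as this stand-alone example shows: ]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t eax = 0x89abcdef, edx = 0x01234567;	/* guest registers */
	uint64_t msr_data = (uint64_t)eax | ((uint64_t)edx << 32);

	printf("%#llx\n", (unsigned long long)msr_data);
	/* prints 0x123456789abcdef */
	return 0;
}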
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 ++ arch/x86/kvm/emulate.c | 36 ++++++++++++++++++------------------ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c37296d0e909..f751657be732 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,8 @@ struct x86_emulate_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); }; /* Type, address-of, and value of an instruction's operand. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a4aa73ff1e4..7c8ed560fd41 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1875,7 +1875,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, } static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1890,7 +1890,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; cs.selector = (u16)(msr_data & 0xfffc); ss.selector = (u16)(msr_data + 8); @@ -1907,17 +1907,17 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); + ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? 
+ MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1927,7 +1927,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) } static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -1949,7 +1949,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) setup_syscalls_segments(ctxt, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { @@ -1979,17 +1979,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; @@ -2012,7 +2012,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.dpl = 3; ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs.selector = (u16)(msr_data + 16); @@ -3099,7 +3099,7 @@ twobyte_insn: } break; case 0x05: /* syscall */ - rc = emulate_syscall(ctxt); + rc = emulate_syscall(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else @@ -3155,7 +3155,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } @@ -3164,7 +3164,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } else { @@ -3175,14 +3175,14 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt); + rc = emulate_sysenter(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else goto writeback; break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt); + rc = emulate_sysexit(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 059d63de169b..e3a5455049b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3811,6 +3811,8 @@ static struct x86_emulate_ops emulate_ops = { .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, }; static void 
cache_all_regs(struct kvm_vcpu *vcpu) -- cgit From 5951c4423724759906b10a26aa6a8817c4afa615 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:29 +0300 Subject: KVM: x86 emulator: add get_cached_segment_base() callback to x86_emulate_ops On VMX it is expensive to call get_cached_descriptor() just to get segment base since multiple vmcs_reads are done instead of only one. Introduce new call back get_cached_segment_base() for efficiency. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 13 +------------ arch/x86/kvm/x86.c | 7 +++++++ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index f751657be732..df53ba2294b6 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -132,6 +132,7 @@ struct x86_emulate_ops { int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); + unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7c8ed560fd41..8228778ace38 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2097,17 +2097,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } -static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - int seg) -{ - struct desc_struct desc; - if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) - return get_desc_base(&desc); - else - return ~0; -} - static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct tss_segment_16 *tss) @@ -2383,7 +2372,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, int ret; u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); ulong old_tss_base = - get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? 
*/ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3a5455049b0..9a469df6011c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3714,6 +3714,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, struct kvm_vcpu *vcpu) { @@ -3804,6 +3810,7 @@ static struct x86_emulate_ops emulate_ops = { .set_cached_descriptor = emulator_set_cached_descriptor, .get_segment_selector = emulator_get_segment_selector, .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, -- cgit From 79168fd1a307ffee46ee03b7f8711559241738c7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:30 +0300 Subject: KVM: x86 emulator: cleanup some direct calls into kvm to use existing callbacks Use callbacks from x86_emulate_ops to access segments instead of calling into kvm directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 200 ++++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8228778ace38..f56ec486393e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -622,31 +622,35 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); + return ops->get_cached_segment_base(seg, ctxt->vcpu); } static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) return 0; - return seg_base(ctxt, c->seg_override); + return seg_base(ctxt, ops, c->seg_override); } -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +static unsigned long es_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_ES); + return seg_base(ctxt, ops, VCPU_SREG_ES); } -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - return seg_base(ctxt, VCPU_SREG_SS); + return seg_base(ctxt, ops, VCPU_SREG_SS); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -941,7 +945,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { @@ -1065,7 +1069,7 @@ done_prefixes: set_seg_override(c, VCPU_SREG_DS); if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); + c->modrm_ea += seg_override_base(ctxt, ops, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1161,7 +1165,7 @@ done_prefixes: c->src.type = OP_MEM; c->src.bytes = 
(c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, c), + register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; @@ -1257,7 +1261,7 @@ done_prefixes: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt), + register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; @@ -1516,7 +1520,8 @@ exception: return X86EMUL_PROPAGATE_FAULT; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; @@ -1524,7 +1529,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]); } @@ -1535,7 +1540,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt), + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]), dest, len); if (rc != X86EMUL_CONTINUE) @@ -1588,15 +1593,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment segment; - kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - c->src.val = segment.selector; - emulate_push(ctxt); + emulate_push(ctxt, ops); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1614,7 +1618,8 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +static void emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1624,7 +1629,7 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt) (reg == VCPU_REGS_RSP) ? 
(c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt); + emulate_push(ctxt, ops); ++reg; } } @@ -1726,14 +1731,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } return X86EMUL_CONTINUE; @@ -1847,39 +1852,40 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) + struct x86_emulate_ops *ops, struct desc_struct *cs, + struct desc_struct *ss) { - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); + memset(cs, 0, sizeof(struct desc_struct)); + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ + set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ + set_desc_limit(cs, 0xfffff); /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; + cs->p = 1; + cs->d = 1; - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ + set_desc_base(ss, 0); /* flat segment */ + set_desc_limit(ss, 0xfffff); /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ + ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; - ss->present = 1; + ss->p = 1; } static int emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -1888,19 +1894,21 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); + cs_sel = (u16)(msr_data & 0xfffc); + ss_sel = (u16)(msr_data + 8); if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { @@ -1930,8 +1938,9 @@ static int emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { @@ -1947,7 +1956,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return 
X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { @@ -1966,18 +1975,20 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data; + cs_sel &= ~SELECTOR_RPL_MASK; + ss_sel = cs_sel + 8; + ss_sel &= ~SELECTOR_RPL_MASK; if (ctxt->mode == X86EMUL_MODE_PROT64 || is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; @@ -1992,9 +2003,10 @@ static int emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; int usermode; + u16 cs_sel, ss_sel; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || @@ -2003,7 +2015,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -2015,29 +2027,31 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); + cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = (u16)(msr_data + 24); + ss_sel = (u16)(msr_data + 24); break; case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); + cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = cs.selector + 8; - cs.db = 0; + ss_sel = cs_sel + 8; + cs.d = 0; cs.l = 1; break; } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; + cs_sel |= SELECTOR_RPL_MASK; + ss_sel |= SELECTOR_RPL_MASK; - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; @@ -2061,25 +2075,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - struct kvm_segment tr_seg; + struct desc_struct tr_seg; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); - if (tr_seg.unusable) + ops->get_cached_descriptor(&tr_seg, 
VCPU_SREG_TR, ctxt->vcpu); + if (!tr_seg.p) return false; - if (tr_seg.limit < 103) + if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, - NULL); + r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, + ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; - if (io_bitmap_ptr + port/8 > tr_seg.limit) + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, - ctxt->vcpu, NULL); + r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, + &perm, 1, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2445,7 +2459,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt); + emulate_push(ctxt, ops); } return ret; @@ -2588,7 +2602,7 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, VCPU_SREG_ES); + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -2600,14 +2614,14 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, VCPU_SREG_CS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, VCPU_SREG_SS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -2619,7 +2633,7 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, VCPU_SREG_DS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -2649,7 +2663,7 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: @@ -2658,7 +2672,7 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt); + emulate_pusha(ctxt, ops); break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); @@ -2672,7 +2686,7 @@ special_insn: break; case 0x68: /* push imm */ case 0x6a: /* push imm8 */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ @@ -2752,18 +2766,13 @@ special_insn: break; case 0x88 ... 
0x8b: /* mov */ goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= VCPU_SREG_GS) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { + case 0x8c: /* mov r/m, sreg */ + if (c->modrm_reg > VCPU_SREG_GS) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } - c->dst.val = segreg.selector; + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; - } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; @@ -2804,7 +2813,7 @@ special_insn: goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -2872,7 +2881,7 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 0xe9: /* jmp rel */ @@ -2985,11 +2994,12 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, - &c->src); + string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), + VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) - string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, + &c->dst); if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; @@ -3188,7 +3198,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, VCPU_SREG_FS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3207,7 +3217,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, VCPU_SREG_GS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); -- cgit From 0f12244fe70e8a94a491f6cd7ed70a352ab6c26c Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:31 +0300 Subject: KVM: x86 emulator: make set_cr() callback return error if it fails Make set_cr() callback return error if it fails instead of injecting #GP behind emulator's back. 
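The conversion described above is the usual two-level split: a low-level variant that only validates and returns nonzero on failure, plus a thin public wrapper that keeps the old inject-#GP-on-error behaviour for existing callers, so the emulator can call the low-level variant and decide for itself how to report the fault. A minimal standalone sketch of that split follows; the types, mask and names are illustrative stand-ins, not the kernel's __kvm_set_cr0().

#include <stdio.h>

#define CR0_RESERVED_BITS 0xffff0000u	/* made-up mask, for illustration only */

struct vcpu { unsigned long cr0; };

static void inject_gp(struct vcpu *v)
{
	(void)v;
	printf("#GP(0) injected\n");
}

/* low-level variant: validate and report, never inject */
static int __set_cr0(struct vcpu *v, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS)
		return 1;
	v->cr0 = cr0;
	return 0;
}

/* public wrapper: preserve the old behaviour for callers that expect it */
static void set_cr0(struct vcpu *v, unsigned long cr0)
{
	if (__set_cr0(v, cr0))
		inject_gp(v);
}

int main(void)
{
	struct vcpu v = { 0 };

	set_cr0(&v, 0x1);		/* accepted */
	set_cr0(&v, 0xdead0000);	/* reserved bits set, takes the #GP path */
	return 0;
}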
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 10 ++- arch/x86/kvm/x86.c | 148 +++++++++++++++++++------------------ 3 files changed, 84 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df53ba2294b6..6c4f4918db5e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -135,7 +135,7 @@ struct x86_emulate_ops { unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); + int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f56ec486393e..061f7d37c9f7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,7 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - ops->set_cr(3, tss->cr3, ctxt->vcpu); + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; c->regs[VCPU_REGS_RAX] = tss->eax; @@ -3135,7 +3138,10 @@ twobyte_insn: c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); + if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9a469df6011c..64c6e7a31411 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -414,57 +414,49 @@ out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & 0xffffffff00000000UL) + return 1; #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; } kvm_x86_ops->set_cr0(vcpu, cr0); kvm_mmu_reset_context(vcpu); - return; + return 0; +} + +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (__kvm_set_cr0(vcpu, cr0)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr0); @@ -474,61 +466,56 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } 
EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; - if (cr4 & CR4_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr4 & CR4_RESERVED_BITS) + return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; kvm_mmu_reset_context(vcpu); + + return 0; +} + +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (__kvm_set_cr4(vcpu, cr4)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -546,24 +533,34 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. 
*/ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; +} + +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (__kvm_set_cr3(vcpu, cr3)) kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (__kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -3681,27 +3678,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = __kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - kvm_set_cr8(vcpu, val & 0xfUL); + res = __kvm_set_cr8(vcpu, val & 0xfUL); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; } + + return res; } static int emulator_get_cpl(struct kvm_vcpu *vcpu) -- cgit From 338dbc9781eb5acd0b12809d95d4006135f29767 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:32 +0300 Subject: KVM: x86 emulator: make (get|set)_dr() callback return error if it fails Make (get|set)_dr() callback return error if it fails instead of injecting exception behind emulator's back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++++--- arch/x86/kvm/x86.c | 63 +++++++++++++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 061f7d37c9f7..d5979ecc2521 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3151,9 +3151,14 @@ twobyte_insn: goto done; } - ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] & - ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), - ctxt->vcpu); + if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? + ~0ULL : ~0U), ctxt->vcpu) < 0) { + /* #UD condition is already handled by the code above */ + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 64c6e7a31411..44a546b136fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -573,7 +573,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { case 0 ... 
3: @@ -582,29 +582,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -615,28 +607,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; @@ -645,6 +646,15 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 0; } + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; +} EXPORT_SYMBOL_GPL(kvm_get_dr); static inline u32 bit(int bitno) @@ -3619,12 +3629,13 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - return kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(vcpu, dr, dest); } int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - return kvm_set_dr(vcpu, dr, value); + + return __kvm_set_dr(vcpu, dr, value); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -- cgit From e680080e653b8c8725ca620bf22a5f8480f40cb5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:33 +0300 Subject: KVM: x86 emulator: fix X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED values Currently X86EMUL_PROPAGATE_FAULT, X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED have the same value so caller cannot distinguish why function such as emulator_cmpxchg_emulated() (which can return both X86EMUL_PROPAGATE_FAULT and X86EMUL_CMPXCHG_FAILED) failed. 
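With all three codes defined as 2, a caller of emulator_cmpxchg_emulated() had no way to branch on the reason for failure; giving them distinct values makes that possible. A small standalone illustration of the point follows; the constant values mirror the header after this patch, while the helper and the caller are stand-ins.

#include <stdio.h>

#define X86EMUL_CONTINUE         0
#define X86EMUL_UNHANDLEABLE     1
#define X86EMUL_PROPAGATE_FAULT  2
#define X86EMUL_RETRY_INSTR      3	/* was 2 before this patch */
#define X86EMUL_CMPXCHG_FAILED   4	/* was 2 before this patch */

/* stand-in for emulator_cmpxchg_emulated(): either outcome is possible */
static int cmpxchg_emulated(int fault)
{
	return fault ? X86EMUL_PROPAGATE_FAULT : X86EMUL_CMPXCHG_FAILED;
}

int main(void)
{
	int fault;

	for (fault = 0; fault <= 1; fault++) {
		switch (cmpxchg_emulated(fault)) {
		case X86EMUL_PROPAGATE_FAULT:
			printf("inject the pending fault into the guest\n");
			break;
		case X86EMUL_CMPXCHG_FAILED:
			printf("fall back to a plain emulated write\n");
			break;
		}
	}
	return 0;
}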
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6c4f4918db5e..0cf4311db0d6 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -51,8 +51,9 @@ struct x86_emulate_ctxt; #define X86EMUL_UNHANDLEABLE 1 /* Terminate emulation but return success to the caller. */ #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ + struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. -- cgit From 411c35b7ef02aefb91e166ffeffad0891d955fcb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:34 +0300 Subject: KVM: fill in run->mmio details in (read|write)_emulated function Fill in run->mmio details in (read|write)_emulated function just like pio does. There is no point in filling only vcpu fields there just to copy them into vcpu->run a little bit later. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 44a546b136fc..b976c4c1fa8f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3386,9 +3386,10 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; return X86EMUL_UNHANDLEABLE; } @@ -3436,10 +3437,11 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); return X86EMUL_CONTINUE; } @@ -3850,7 +3852,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, { int r, shadow_mask; struct decode_cache *c; - struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3937,14 +3938,6 @@ restart: return EMULATE_DO_MMIO; } - if (r || vcpu->mmio_is_write) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; - } - if (r) { if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) goto done; -- cgit From c3cd7ffaf57ae6ead5b394cebaeb76164059a57f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:35 +0300 Subject: KVM: x86 emulator: x86_emulate_insn() return -1 only in case of emulation failure Currently emulator returns -1 when emulation failed or IO is needed. 
Caller tries to guess whether emulation failed by looking at other variables. Make it easier for caller to recognise error condition by always returning -1 in case of failure. For this new emulator internal return value X86EMUL_IO_NEEDED is introduced. It is used to distinguish between error condition (which returns X86EMUL_UNHANDLEABLE) and condition that requires IO exit to userspace to continue emulation. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 36 ++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0cf4311db0d6..777240d4524e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -53,6 +53,7 @@ struct x86_emulate_ctxt; #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ +#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ struct x86_emulate_ops { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b976c4c1fa8f..4cb65d82abca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3275,7 +3275,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3331,7 +3331,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3391,7 +3391,7 @@ mmio: vcpu->run->mmio.len = vcpu->mmio_size = bytes; vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - return X86EMUL_UNHANDLEABLE; + return X86EMUL_IO_NEEDED; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3863,8 +3863,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); - vcpu->mmio_is_write = 0; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -3938,24 +3936,26 @@ restart: return EMULATE_DO_MMIO; } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - goto done; - if (!vcpu->mmio_needed) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; + if (r) { /* emulation failed */ + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + + trace_kvm_emulate_insn_failed(vcpu); + kvm_report_emulation_failure(vcpu, "mmio"); + return EMULATE_FAIL; } -done: if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit From f181b96d4c769b8915849eb9070c18116fd8d44e Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:36 +0300 Subject: KVM: remove export of emulator_write_emulated() It is not called directly outside of the file it's defined in anymore. 
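This is the routine clean-up that follows losing the last external caller: the extern prototype and the export go away. A generic sketch of the idiom under made-up names is shown below; note that the patch itself only removes the header declaration and the EXPORT_SYMBOL_GPL line, and does not change the function definition.

#include <stdio.h>

/*
 * before: a header elsewhere carried "extern int helper(int);" and the
 * definition carried an export; afterwards the symbol is file-local.
 */
static int helper(int x)
{
	return x * 2;
}

int public_entry(int x)
{
	return helper(x);	/* the only remaining caller, in the same file */
}

int main(void)
{
	printf("%d\n", public_entry(21));
	return 0;
}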
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 ----- arch/x86/kvm/x86.c | 1 - 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97774ae3c874..2ca1867ed97a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -628,11 +628,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu); void fx_init(struct kvm_vcpu *vcpu); -int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cb65d82abca..15a4b754a451 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3465,7 +3465,6 @@ int emulator_write_emulated(unsigned long addr, } return emulator_write_emulated_onepage(addr, val, bytes, vcpu); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); #define CMPXCHG_TYPE(t, ptr, old, new) \ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) -- cgit From 8fe681e984b6505d4d12125c0776399304803ec7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:37 +0300 Subject: KVM: do not inject #PF in (read|write)_emulated() callbacks Return error to x86 emulator instead of injection exception behind its back. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 +++ arch/x86/kvm/emulate.c | 12 +++++++++++- arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 777240d4524e..b7e00cb21c6e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -94,6 +94,7 @@ struct x86_emulate_ops { int (*read_emulated)(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -106,6 +107,7 @@ struct x86_emulate_ops { int (*write_emulated)(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); /* @@ -120,6 +122,7 @@ struct x86_emulate_ops { const void *old, const void *new, unsigned int bytes, + unsigned int *error, struct kvm_vcpu *vcpu); int (*pio_in_emulated)(int size, unsigned short port, void *val, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d5979ecc2521..d7a18a0f80a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1277,6 +1277,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, { int rc; struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; while (size) { int n = min(size, 8u); @@ -1284,7 +1285,10 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu); + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1789,6 +1793,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, { int rc; struct decode_cache *c = &ctxt->decode; + u32 err; switch (c->dst.type) { case OP_REG: @@ -1817,13 +1822,18 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &c->dst.orig_val, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); else rc = 
ops->write_emulated( (unsigned long)c->dst.ptr, &c->dst.val, c->dst.bytes, + &err, ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, + (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; break; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 15a4b754a451..51402d8a46fa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3346,10 +3346,10 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3359,12 +3359,10 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3409,17 +3407,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3449,6 +3445,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ @@ -3456,14 +3453,16 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, error_code, + vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, error_code, + vcpu); } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -3480,6 +3479,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3533,7 +3533,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, vcpu); + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -4293,7 +4293,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, vcpu); + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -- cgit From 3457e4192e367fd4e0da5e9f46f9df85fa99cd11 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:38 +0300 Subject: KVM: handle emulation failure case first If emulation failed return immediately. 
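Moving the failure check to the top turns the rest of the function into a straight-line success path that no longer needs "did we fail?" guards. A toy sketch of the resulting shape follows; the EMULATE_* names match the return codes used above, everything else is a stand-in rather than the kernel's emulate_instruction().

#include <stdio.h>

enum { EMULATE_DONE, EMULATE_FAIL, EMULATE_DO_MMIO };

/* stand-in for the actual emulation call; nonzero means it failed */
static int run_insn(int fail)
{
	return fail;
}

static int emulate(int fail, int mmio_needed)
{
	int r = run_insn(fail);

	if (r) {			/* failure handled first: bail out */
		printf("report emulation failure\n");
		return EMULATE_FAIL;
	}

	/* straight-line success path: commit state, then check for I/O */
	printf("commit RIP/RFLAGS/registers\n");
	if (mmio_needed)
		return EMULATE_DO_MMIO;
	return EMULATE_DONE;
}

int main(void)
{
	emulate(0, 1);	/* success, needs an MMIO exit */
	emulate(1, 0);	/* failure path */
	return 0;
}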
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 51402d8a46fa..9e5a833f3392 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3924,22 +3924,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - - if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } if (r) { /* emulation failed */ /* @@ -3955,6 +3939,21 @@ restart: return EMULATE_FAIL; } + shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; + kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; + return EMULATE_DO_MMIO; + } + + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; + } + if (vcpu->arch.exception.pending) vcpu->arch.emulate_ctxt.restart = false; -- cgit From 95c5588652f7742a21c33d9dcce0e043e057d04f Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:39 +0300 Subject: KVM: x86 emulator: advance RIP outside x86 emulator code Return new RIP as part of instruction emulation result instead of updating KVM's RIP from x86 emulator code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 ++++--- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7a18a0f80a2..437f31bcffea 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2496,8 +2496,9 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, if (rc == X86EMUL_CONTINUE) { memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); rc = writeback(ctxt, ops); + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; } return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -2554,7 +2555,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { string_done: ctxt->restart = false; - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; goto done; } /* The second termination condition only applies for REPE @@ -3032,7 +3033,7 @@ writeback: ctxt->decode.mem_read.end = 0; /* Commit shadow register state. 
*/ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e5a833f3392..8f45cc712dda 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3941,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) @@ -4945,6 +4946,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; } -- cgit From ef050dc0390176ec6888f373edb776587c88be3d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:40 +0300 Subject: KVM: x86 emulator: set RFLAGS outside x86 emulator code Removes the need for set_flags() callback. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 1 - arch/x86/kvm/x86.c | 7 +------ 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index b7e00cb21c6e..a87d95f09572 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,7 +142,6 @@ struct x86_emulate_ops { ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 437f31bcffea..291e220c69a5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3034,7 +3034,6 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; - ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8f45cc712dda..04ca343ee512 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3806,11 +3806,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - kvm_x86_ops->set_rflags(vcpu, rflags); -} - static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -3829,7 +3824,6 @@ static struct x86_emulate_ops emulate_ops = { .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, - .set_rflags = emulator_set_rflags, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, .set_msr = kvm_set_msr, @@ -3941,6 +3935,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { -- cgit From bdb475a323858101f4a5ad6a1a04b1dd8885325a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:41 +0300 Subject: KVM: x86 emulator: use shadowed register in emulate_sysexit() emulate_sysexit() should use shadowed registers copy instead of looking into vcpu state directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 291e220c69a5..42cb7d71ff55 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2063,8 +2063,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + c->eip = c->regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } -- cgit From 4d2179e1e9cb74b25a8181a506600d96e15504fb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:42 +0300 Subject: KVM: x86 emulator: handle shadowed registers outside emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 --------------- arch/x86/kvm/x86.c | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 42cb7d71ff55..97a42e8c00d0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -941,12 +941,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); - /* Shadow copy of register state. Committed on successful emulation. 
*/ - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { case X86EMUL_MODE_REAL: @@ -2486,16 +2483,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); c->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) { - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); rc = writeback(ctxt, ops); if (rc == X86EMUL_CONTINUE) ctxt->eip = c->eip; @@ -2525,13 +2519,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; @@ -3031,8 +3018,6 @@ writeback: * without decoding */ ctxt->decode.mem_read.end = 0; - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); ctxt->eip = c->eip; done: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04ca343ee512..21d36081a9d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3844,7 +3844,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { int r, shadow_mask; - struct decode_cache *c; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3869,13 +3869,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ - c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -3916,6 +3917,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + /* this is needed for vmware backdor interface to work since it + changes registers values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); @@ -3936,6 +3941,7 @@ restart: shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); if (vcpu->arch.pio.count) { @@ -4919,6 +4925,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l, ret; cache_all_regs(vcpu); @@ -4933,6 +4940,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, ? 
X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, tss_selector, reason, has_error_code, @@ -4941,6 +4950,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; -- cgit From 95cb229530f329ec8002274891793be9c91385f7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:43 +0300 Subject: KVM: x86 emulator: move interruptibility state tracking out of emulator Emulator shouldn't access vcpu directly. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++----------------- arch/x86/kvm/x86.c | 20 +++++++++++++++++--- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97a42e8c00d0..c40b40561dff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1843,20 +1843,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2516,7 +2502,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; - ctxt->interruptibility = 0; ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { @@ -2789,7 +2774,7 @@ special_insn: } if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); @@ -2958,7 +2943,7 @@ special_insn: if (emulator_bad_iopl(ctxt, ops)) kvm_inject_gp(ctxt->vcpu, 0); else { - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21d36081a9d9..91bfe7771f50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3838,12 +3838,26 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. 
The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type) { - int r, shadow_mask; + int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); @@ -3871,6 +3885,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_ctxt.interruptibility = 0; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3938,8 +3953,7 @@ restart: return EMULATE_FAIL; } - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); -- cgit From 54b8486f469475d6c8e8aec917b91239a54eb8c8 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 28 Apr 2010 19:15:44 +0300 Subject: KVM: x86 emulator: do not inject exception directly into vcpu Return exception as a result of instruction emulation and handle injection in KVM code. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 6 ++ arch/x86/kvm/emulate.c | 124 +++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 20 +++++- 3 files changed, 100 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a87d95f09572..51cfd730ac5d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -216,6 +216,12 @@ struct x86_emulate_ctxt { int interruptibility; bool restart; /* restart string instruction after writeback */ + + int exception; /* exception that happens during emulation or -1 */ + u32 error_code; /* error code for exception */ + bool error_code_valid; + unsigned long cr2; /* faulted address in case of #PF */ + /* decode cache */ struct decode_cache decode; }; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c40b40561dff..b43ac98ef790 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -653,6 +653,37 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, return seg_base(ctxt, ops, VCPU_SREG_SS); } +static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, + u32 error, bool valid) +{ + ctxt->exception = vec; + ctxt->error_code = error; + ctxt->error_code_valid = valid; + ctxt->restart = false; +} + +static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, GP_VECTOR, err, true); +} + +static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, + int err) +{ + ctxt->cr2 = addr; + emulate_exception(ctxt, PF_VECTOR, err, true); +} + +static void emulate_ud(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, TS_VECTOR, err, true); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -1285,7 +1316,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc == 
X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1366,13 +1397,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1391,14 +1422,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1517,7 +1548,7 @@ load: ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: - kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + emulate_exception(ctxt, err_vec, err_code, true); return X86EMUL_PROPAGATE_FAULT; } @@ -1578,7 +1609,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; case X86EMUL_MODE_VM86: if (iopl < 3) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } change_mask |= EFLG_IF; @@ -1829,7 +1860,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, + emulate_pf(ctxt, (unsigned long)c->dst.ptr, err); if (rc != X86EMUL_CONTINUE) return rc; @@ -1883,7 +1914,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1937,7 +1968,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -1945,7 +1976,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) * Therefore, we inject an #UD. 
*/ if (ctxt->mode == X86EMUL_MODE_PROT64) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } @@ -1955,13 +1986,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; @@ -2004,7 +2035,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -2022,7 +2053,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = (u16)(msr_data + 24); @@ -2030,7 +2061,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) case X86EMUL_MODE_PROT64: cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } ss_sel = cs_sel + 8; @@ -2192,7 +2223,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2202,7 +2233,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2210,7 +2241,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2223,7 +2254,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2266,7 +2297,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, int ret; if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } c->eip = tss->eip; @@ -2334,7 +2365,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2344,7 +2375,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2352,7 +2383,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == 
X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2365,7 +2396,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2399,7 +2430,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } } @@ -2408,8 +2439,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, - tss_selector & 0xfffc); + emulate_ts(ctxt, tss_selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } @@ -2505,19 +2535,19 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* LOCK prefix is allowed only with some instructions */ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -2679,7 +2709,7 @@ special_insn: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, @@ -2691,7 +2721,7 @@ special_insn: c->src.bytes = min(c->src.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->src.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], @@ -2754,7 +2784,7 @@ special_insn: goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); @@ -2769,7 +2799,7 @@ special_insn: if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -2895,7 +2925,7 @@ special_insn: do_io_in: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, @@ -2908,7 +2938,7 @@ special_insn: do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, @@ -2933,7 +2963,7 @@ special_insn: break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -2941,7 +2971,7 @@ special_insn: break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; @@ -3069,7 +3099,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 5: /* not defined */ - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, c->modrm_ea); @@ -3102,7 +3132,7 @@ twobyte_insn: case 1: case 5 ... 7: case 9 ... 15: - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); @@ -3111,7 +3141,7 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); @@ -3119,7 +3149,7 @@ twobyte_insn: break; case 0x22: /* mov reg, cr */ if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } c->dst.type = OP_NONE; @@ -3127,7 +3157,7 @@ twobyte_insn: case 0x23: /* mov from reg to dr */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } @@ -3135,7 +3165,7 @@ twobyte_insn: ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -3146,7 +3176,7 @@ twobyte_insn: msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } rc = X86EMUL_CONTINUE; @@ -3155,7 +3185,7 @@ twobyte_insn: case 0x32: /* rdmsr */ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 91bfe7771f50..63c87adcec48 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3852,6 +3852,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) kvm_x86_ops->set_interrupt_shadow(vcpu, mask); } +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception == PF_VECTOR) + kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + else if (ctxt->error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + else + kvm_queue_exception(vcpu, ctxt->exception); +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3886,6 +3897,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.exception = -1; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); @@ -3958,6 +3970,11 @@ restart: memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (vcpu->arch.emulate_ctxt.exception >= 0) { + inject_emulated_exception(vcpu); + return EMULATE_DONE; + } + if 
(vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; @@ -3970,9 +3987,6 @@ restart: return EMULATE_DO_MMIO; } - if (vcpu->arch.exception.pending) - vcpu->arch.emulate_ctxt.restart = false; - if (vcpu->arch.emulate_ctxt.restart) goto restart; -- cgit From d94e1dc9af60e3431a586c3edfbe42d8a0d3932b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 16:54:48 +0300 Subject: KVM: Get rid of KVM_REQ_KICK KVM_REQ_KICK poisons vcpu->requests by having a bit set during normal operation. This causes the fast path check for a clear vcpu->requests to fail all the time, triggering tons of atomic operations. Fix by replacing KVM_REQ_KICK with a vcpu->guest_mode atomic. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63c87adcec48..fc5611b4007f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4604,13 +4604,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); - local_irq_disable(); + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); - clear_bit(KVM_REQ_KICK, &vcpu->requests); - smp_mb__after_clear_bit(); + local_irq_disable(); - if (vcpu->requests || need_resched() || signal_pending(current)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); preempt_enable(); r = 1; @@ -4655,7 +4657,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - set_bit(KVM_REQ_KICK, &vcpu->requests); + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); ++vcpu->stat.exits; @@ -5580,7 +5583,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + if (atomic_xchg(&vcpu->guest_mode, 0)) smp_send_reschedule(cpu); put_cpu(); } -- cgit From 3f10c846f8f9e6a32bbfeefaf7dda7ff51c7da29 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:42 +0200 Subject: KVM: SVM: Dump vmcb contents on failed vmrun This patch adds a function to dump the vmcb into the kernel log and calls it after a failed vmrun to ease debugging. 
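As a rough user-space sketch of the guest_mode handshake described in the KVM_REQ_KICK patch above, modelled with C11 atomics. The names vcpu_state, send_ipi, try_enter_guest and kick_vcpu are invented for the illustration and are not kernel symbols; the real code uses vcpu->guest_mode, smp_wmb() and smp_send_reschedule().

#include <stdatomic.h>
#include <stdbool.h>

struct vcpu_state {
	atomic_int guest_mode;	/* 1 while the vcpu is (about to be) in guest context */
	atomic_int requests;	/* stands in for vcpu->requests */
};

static void send_ipi(void)
{
	/* placeholder for smp_send_reschedule() */
}

/* Entry path: publish guest_mode first, then re-check for pending work. */
static bool try_enter_guest(struct vcpu_state *v)
{
	atomic_store(&v->guest_mode, 1);
	if (!atomic_load(&v->guest_mode) || atomic_load(&v->requests)) {
		atomic_store(&v->guest_mode, 0);	/* back out and retry later */
		return false;
	}
	return true;	/* the caller would now really enter the guest */
}

/* Kick path: send the IPI only if the target was actually in guest mode. */
static void kick_vcpu(struct vcpu_state *v)
{
	atomic_store(&v->requests, 1);
	if (atomic_exchange(&v->guest_mode, 0))
		send_ipi();
}

The xchg on the kick side is what keeps the fast path clean: the flag is set only while entering the guest, so an idle vcpu never leaves a poisoned bit behind and never triggers a spurious IPI.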
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..685cffff01f3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2726,6 +2726,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; +void dump_vmcb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + pr_err("VMCB Control Area:\n"); + pr_err("cr_read: %04x\n", control->intercept_cr_read); + pr_err("cr_write: %04x\n", control->intercept_cr_write); + pr_err("dr_read: %04x\n", control->intercept_dr_read); + pr_err("dr_write: %04x\n", control->intercept_dr_write); + pr_err("exceptions: %08x\n", control->intercept_exceptions); + pr_err("intercepts: %016llx\n", control->intercept); + pr_err("pause filter count: %d\n", control->pause_filter_count); + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); + pr_err("tsc_offset: %016llx\n", control->tsc_offset); + pr_err("asid: %d\n", control->asid); + pr_err("tlb_ctl: %d\n", control->tlb_ctl); + pr_err("int_ctl: %08x\n", control->int_ctl); + pr_err("int_vector: %08x\n", control->int_vector); + pr_err("int_state: %08x\n", control->int_state); + pr_err("exit_code: %08x\n", control->exit_code); + pr_err("exit_info1: %016llx\n", control->exit_info_1); + pr_err("exit_info2: %016llx\n", control->exit_info_2); + pr_err("exit_int_info: %08x\n", control->exit_int_info); + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); + pr_err("nested_ctl: %lld\n", control->nested_ctl); + pr_err("nested_cr3: %016llx\n", control->nested_cr3); + pr_err("event_inj: %08x\n", control->event_inj); + pr_err("event_inj_err: %08x\n", control->event_inj_err); + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); + pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("VMCB State Save Area:\n"); + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); + pr_err("cpl: %d efer: %016llx\n", + save->cpl, save->efer); + pr_err("cr0: %016llx cr2: %016llx\n", + save->cr0, 
save->cr2); + pr_err("cr3: %016llx cr4: %016llx\n", + save->cr3, save->cr4); + pr_err("dr6: %016llx dr7: %016llx\n", + save->dr6, save->dr7); + pr_err("rip: %016llx rflags: %016llx\n", + save->rip, save->rflags); + pr_err("rsp: %016llx rax: %016llx\n", + save->rsp, save->rax); + pr_err("star: %016llx lstar: %016llx\n", + save->star, save->lstar); + pr_err("cstar: %016llx sfmask: %016llx\n", + save->cstar, save->sfmask); + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", + save->kernel_gs_base, save->sysenter_cs); + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", + save->sysenter_esp, save->sysenter_eip); + pr_err("gpat: %016llx dbgctl: %016llx\n", + save->g_pat, save->dbgctl); + pr_err("br_from: %016llx br_to: %016llx\n", + save->br_from, save->br_to); + pr_err("excp_from: %016llx excp_to: %016llx\n", + save->last_excp_from, save->last_excp_to); + +} + static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2770,6 +2863,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + dump_vmcb(vcpu); return 0; } -- cgit From eec4b140c924b4c650e9a89e01d223266490e325 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 5 May 2010 16:04:44 +0200 Subject: KVM: SVM: Allow EFER.LMSLE to be set with nested svm This patch enables setting of efer bit 13 which is allowed in all SVM capable processors. This is necessary for the SLES11 version of Xen 4.0 to boot with nested svm. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kvm/svm.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae4318629..509a42187dc2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -20,6 +20,7 @@ #define _EFER_LMA 10 /* Long mode active (read-only) */ #define _EFER_NX 11 /* No execute enable */ #define _EFER_SVME 12 /* Enable virtualization */ +#define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ #define EFER_SCE (1<<_EFER_SCE) @@ -27,6 +28,7 @@ #define EFER_LMA (1<<_EFER_LMA) #define EFER_NX (1<<_EFER_NX) #define EFER_SVME (1<<_EFER_SVME) +#define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. Some also available on other CPUs */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 685cffff01f3..41fe0381a1ae 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void) if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } for_each_possible_cpu(cpu) { -- cgit From f3b8c964a9a6cfef6d3ca778648d53947b9fd257 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:09:21 +0800 Subject: KVM: MMU: mark page table dirty when a pte is actually modified Sometime cmpxchg_gpte doesn't modify gpte, in such case, don't mark page table page as dirty. 
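A minimal sketch of the reordering this message describes: the side effect (marking the page dirty) happens only after the compare-and-exchange is known to have modified the entry. set_accessed_bit and mark_dirty are invented user-space stand-ins for the guest pte update and mark_page_dirty(), not the real KVM helpers.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static void mark_dirty(void)
{
	/* placeholder for mark_page_dirty() */
}

static bool set_accessed_bit(_Atomic uint64_t *pte, uint64_t accessed_mask)
{
	uint64_t old = atomic_load(pte);

	if (old & accessed_mask)
		return true;			/* nothing to change */
	if (!atomic_compare_exchange_strong(pte, &old, old | accessed_mask))
		return false;			/* lost the race: caller re-walks */
	mark_dirty();				/* only after the write really happened */
	return true;
}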
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c7f27779c998..5c8ac060442f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -177,10 +177,10 @@ walk: if (!(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -217,11 +217,11 @@ walk: bool ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; } -- cgit From 518c5a05e89a79e498c95c3e29f29bd236b3c972 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:58:33 +0800 Subject: KVM: MMU: Fix debug output error in walk_addr() Fix a debug output error in walk_addr(). Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 5c8ac060442f..15e379eaf3b0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -229,7 +229,7 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pt_access, pte_access); + __func__, (u64)pte, pte_access, pt_access); return 1; not_present: -- cgit From 54a4f0239f2e98bc0842818f611a4cf73bb7dd35 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:03:49 +0800 Subject: KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed Currently, kvm_mmu_zap_page() returns only the number of freed child sp's. This might confuse the caller, because the caller doesn't know how many pages were actually freed. Let's make kvm_mmu_zap_page() return the number of pages it actually freed.
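A small self-contained illustration of the changed accounting convention: the zap helper reports every node it freed, including itself, so callers no longer need the "+ 1" / "- 1" compensation removed by this patch. struct node and zap_node are invented for the example and only mirror the counting idea, not the real shadow-page structures.

#include <stdlib.h>

struct node {
	struct node *child;	/* simplified stand-in for shadow child pages */
	int root_count;
};

static int zap_node(struct node *sp)
{
	int freed = 0;

	while (sp->child) {
		struct node *c = sp->child;

		sp->child = c->child;
		free(c);
		freed++;
	}
	if (sp->root_count == 0) {
		free(sp);
		freed++;	/* count self, as the patch does */
	}
	return freed;
}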
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b666d8d106a9..be981b1f1881 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { + /* Count self */ + ret++; hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { @@ -1540,7 +1542,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); used_pages -= kvm_mmu_zap_page(kvm, page); - used_pages--; } kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; @@ -2941,7 +2942,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page) + 1; + return kvm_mmu_zap_page(kvm, page); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) -- cgit From 6d77dbfc88e37c9efd5c5dd18445cfe819ae17ea Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 10 May 2010 11:16:56 +0300 Subject: KVM: inject #UD if instruction emulation fails and exit to userspace Do not kill VM when instruction emulation fails. Inject #UD and report failure to userspace instead. Userspace may choose to reenter guest if vcpu is in userspace (cpl == 3) in which case guest OS will kill offending process and continue running. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 5 +---- arch/x86/kvm/svm.c | 10 +++------- arch/x86/kvm/vmx.c | 28 ++++----------------------- arch/x86/kvm/x86.c | 43 +++++++++++++++++------------------------ 5 files changed, 26 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ca1867ed97a..0c06148fa3b1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,7 +576,6 @@ enum emulation_result { #define EMULTYPE_SKIP (1 << 2) int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type); -void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be981b1f1881..4a02dee1f2b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2814,11 +2814,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - return 0; + /* fall through */ case EMULATE_FAIL: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; return 0; default: BUG(); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 41fe0381a1ae..134260c36ce2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1535,7 +1535,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return 
emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -2386,16 +2386,12 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int cr8_write_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 598931734251..a82cfa1e2a40 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3070,7 +3070,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -3327,22 +3327,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - enum emulation_result er; - unsigned long offset; - - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOEXEC; - } - return 1; + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; } static int handle_task_switch(struct kvm_vcpu *vcpu) @@ -3554,13 +3539,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - ret = 0; - goto out; - } + if (err != EMULATE_DONE) + return 0; if (signal_pending(current)) goto out; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc5611b4007f..ae9d6f3e5d0d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3639,24 +3639,6 @@ int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) return __kvm_set_dr(vcpu, dr, value); } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) -{ - u8 opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; - - if (!printk_ratelimit()) - return; - - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); -} -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); - static u64 mk_cr_64(u64 curr_cr, u32 new_val) { return (curr_cr & ~((1ULL << 32) - 1)) | new_val; @@ -3863,6 +3845,19 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception); } +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = 
KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_FAIL; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3931,11 +3926,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } } @@ -3960,9 +3955,7 @@ restart: if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); @@ -4798,7 +4791,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r == EMULATE_DO_MMIO) { + if (r != EMULATE_DONE) { r = 0; goto out; } -- cgit From f0f5933a1626c8df7b0bfd227819c66320fb4f0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:09:56 +0300 Subject: KVM: MMU: Fix free memory accounting race in mmu_alloc_roots() We drop the mmu lock between freeing memory and allocating the roots; this allows some other vcpu to sneak in and allocate memory. While the race is benign (resulting only in temporary overallocation, not oom) it is simple and easy to fix by moving the freeing close to the allocation. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a02dee1f2b5..d7aebafffdfe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,6 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2124,6 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2496,9 +2498,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); -- cgit From 7725b89414836df492d6222b1d3cacb0ca576d77 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:38 +0800 Subject: KVM: VMX: Define new functions to wrapper direct call of asm code Define vmcs_load() and kvm_cpu_vmxon() to avoid direct call of asm code. Also move VMXE bit operation out of kvm_cpu_vmxoff(). 
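A sketch of the same wrapping idea outside KVM, assuming GCC-style inline asm on x86: keep the instruction and its constraint list in one tiny helper instead of open-coding it at every call site. rdtsc is used only as a stand-in for VMPTRLD/VMXON, and read_tsc is not a kernel symbol.

#include <stdint.h>

static inline uint64_t read_tsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}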
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a82cfa1e2a40..823288821444 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -453,6 +453,19 @@ static void vmcs_clear(struct vmcs *vmcs) vmcs, phys_addr); } +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -830,7 +843,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta, new_offset; if (vcpu->cpu != cpu) { @@ -844,15 +856,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); + vmcs_load(vmx->vmcs); } if (vcpu->cpu != cpu) { @@ -1288,6 +1293,13 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); @@ -1310,9 +1322,7 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); + kvm_cpu_vmxon(phys_addr); ept_sync_global(); @@ -1336,13 +1346,13 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { vmclear_local_vcpus(); kvm_cpu_vmxoff(); + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, -- cgit From 92fe13be74303a7b80dc3c99e22e12a87d41bd5f Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:42 +0800 Subject: KVM: VMX: Some minor changes to code structure Do some preparations for vmm coexistence support. 
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 823288821444..0d281dbc008f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -845,15 +845,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) { + if (vcpu->cpu != cpu) vcpu_clear(vmx); - kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { per_cpu(current_vmcs, cpu) = vmx->vmcs; @@ -864,6 +857,13 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct desc_ptr dt; unsigned long sysenter_esp; + kvm_migrate_timers(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching -- cgit From b923e62e4d48bc5242b32a6ef5ba0f886137668a Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:45 +0800 Subject: KVM: VMX: VMCLEAR/VMPTRLD usage changes Originally, VMCLEAR/VMPTRLD were called on vcpu migration. To support hosted VMM coexistence, VMCLEAR is executed on vcpu schedule out, and VMPTRLD is executed on vcpu schedule in. This could also eliminate the IPI when doing VMCLEAR. Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0d281dbc008f..9529bff04262 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -63,6 +63,9 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -845,7 +848,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; - if (vcpu->cpu != cpu) + if (vmm_exclusive && vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -891,6 +894,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) + __vcpu_clear(to_vmx(vcpu)); } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) -- cgit From 4610c9cc6d9c84f7d585583699f04d5f51c83671 Mon Sep 17 00:00:00 2001 From: Dongxiao Xu Date: Tue, 11 May 2010 18:29:48 +0800 Subject: KVM: VMX: VMXON/VMXOFF usage changes The SDM suggests that VMXON should be called before VMPTRLD, and VMXOFF should be called after doing VMCLEAR. Therefore, in the VMM coexistence case, we should first call VMXON before any VMCS operation, and then call VMXOFF after the operation is done.
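A schematic of the bracketing this message describes, with invented helpers (vmxon, vmxoff, with_vmx, vmcs_op) standing in for kvm_cpu_vmxon()/kvm_cpu_vmxoff() and the real VMCS operations; it only shows the control flow, not real VMX instructions.

#include <stdbool.h>

static void vmxon(void)  { /* would execute VMXON on the real hardware */ }
static void vmxoff(void) { /* would execute VMXOFF */ }

/* Bracket a single VMCS operation with VMXON/VMXOFF when another VMM may
 * own the CPU, i.e. when vmm_exclusive is false. */
static void with_vmx(bool vmm_exclusive, void (*vmcs_op)(void))
{
	if (!vmm_exclusive)
		vmxon();	/* take VMX ownership just for this operation */
	vmcs_op();		/* e.g. VMPTRLD + VMWRITEs, or VMCLEAR */
	if (!vmm_exclusive)
		vmxoff();	/* hand the CPU back to a coexisting VMM */
}

In the exclusive case the bracketing collapses to nothing, so the common configuration keeps the old single VMXON at hardware_enable() time.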
Signed-off-by: Dongxiao Xu Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9529bff04262..b8aac4e98903 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -176,6 +176,8 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -847,8 +849,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tsc_this, delta, new_offset; + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - if (vmm_exclusive && vcpu->cpu != cpu) + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) vcpu_clear(vmx); if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -894,8 +899,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) + if (!vmm_exclusive) { __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1327,9 +1334,11 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } return 0; } @@ -1355,8 +1364,10 @@ static void kvm_cpu_vmxoff(void) static void hardware_disable(void *garbage) { - vmclear_local_vcpus(); - kvm_cpu_vmxoff(); + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -3991,6 +4002,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vmx); } +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4016,7 +4040,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->vmcs) goto free_msrs; - vmcs_clear(vmx->vmcs); + vmcs_init(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); -- cgit From 24955b6c906045382b67f3e6beba7e5df4a4a045 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 12 May 2010 21:00:35 -0300 Subject: KVM: pass correct parameter to kvm_mmu_free_some_pages Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7aebafffdfe..a455c5eee370 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,7 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2125,7 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); - 
kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); -- cgit From dfb507c41d0d12fc69820abb7f040d31fcf015fe Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Tue, 11 May 2010 22:22:40 +0300 Subject: KVM: x86 emulator: Add test acc, imm instruction (opcodes 0xA8 - 0xA9) This adds test acc, imm instruction to the x86 emulator Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b43ac98ef790..35dd57c5a7fd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -181,7 +181,7 @@ static u32 opcode_table[256] = { ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ @@ -2754,6 +2754,7 @@ special_insn: } break; case 0x84 ... 0x85: + test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ @@ -2852,6 +2853,8 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); goto cmp; + case 0xa8 ... 0xa9: /* test ax, imm */ + goto test; case 0xaa ... 0xab: /* stos */ c->dst.val = c->regs[VCPU_REGS_RAX]; break; -- cgit From abc190830f28a5bb678eaccb633de02ed2967d55 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:21 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for sub instruction This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 35dd57c5a7fd..1b974f80e1e4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -126,7 +126,7 @@ static u32 opcode_table[256] = { /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit From 222b7c52c33bdef721248bfeba992af495800d30 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 12 May 2010 01:39:22 +0300 Subject: KVM: x86 emulator: Add missing decoder flags for xor instructions This adds missing decoder flags for xor instructions (opcodes 0x34 - 0x35) Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1b974f80e1e4..7a36eec8bab8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -130,7 +130,7 @@ static u32 opcode_table[256] = { /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 
0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit From 62ad07551a2ace89e35604d1c55fdae1dd3359a8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:41 +0800 Subject: KVM: x86: Clean up duplicate assignment mmu.free() already sets root_hpa to INVALID_PAGE, so there is no need to do it again in destroy_kvm_mmu(). kvm_x86_ops->set_cr4() and set_efer() already assign cr4/efer to vcpu->arch.cr4/efer, so there is no need to do it again later. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- arch/x86/kvm/x86.c | 4 +--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c5eee370..c075542648cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,10 +2478,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ae9d6f3e5d0d..03039fd86980 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -486,7 +486,7 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; + kvm_mmu_reset_context(vcpu); return 0; @@ -721,8 +721,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.efer = efer; - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); -- cgit From aad827034e419fa8c5ec39e6455266f0b942d856 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:42 +0800 Subject: KVM: VMX: Only reset MMU when necessary Only modifications to some bits of CR0/CR4 need a paging mode switch. Modifying the EFER.NXE bit would result in reserved bit updates.
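A minimal sketch of the changed-bits test this patch introduces: the expensive rebuild runs only when a bit named in the mask actually flipped. set_ctrl_reg, reset_mmu and update_mask are illustrative names; the real code uses kvm_mmu_reset_context() and the CR0/CR4/EFER masks shown in the diff that follows.

#include <stdint.h>

static void reset_mmu(void)
{
	/* placeholder for kvm_mmu_reset_context() */
}

/* Write a control-register value but rebuild the MMU only when one of the
 * bits named in update_mask actually changed. */
static void set_ctrl_reg(uint64_t *reg, uint64_t new_val, uint64_t update_mask)
{
	uint64_t old = *reg;

	*reg = new_val;
	if ((new_val ^ old) & update_mask)
		reset_mmu();
}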
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03039fd86980..78147f0421a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -416,6 +416,10 @@ out: static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 @@ -449,7 +453,8 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - kvm_mmu_reset_context(vcpu); + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -487,7 +492,8 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_x86_ops->set_cr4(vcpu, cr4); - kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); return 0; } @@ -693,6 +699,8 @@ static u32 emulated_msrs[] = { static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { + u64 old_efer = vcpu->arch.efer; + if (efer & efer_reserved_bits) return 1; @@ -724,6 +732,10 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + return 0; } -- cgit From e8ad9a707496c163312bcdd6aa3b90603d45dc9b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:06:02 +0800 Subject: KVM: MMU: use proper cache object freeing function Use kmem_cache_free to free objects allocated by kmem_cache_alloc. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c075542648cd..bb48b0ca5f8c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -305,10 +305,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) { while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); + kmem_cache_free(cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -356,10 +357,11 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -380,7 +382,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kfree(pc); + kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -391,7 +393,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void 
mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kfree(rd); + kmem_cache_free(rmap_desc_cache, rd); } /* @@ -898,7 +900,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->link); __free_page(virt_to_page(sp->spt)); __free_page(virt_to_page(sp->gfns)); - kfree(sp); + kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } -- cgit From 6d74229f013ed8e4a00d74cfa7a3fa6a2315c467 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:07:00 +0800 Subject: KVM: MMU: remove rmap before clear spte Remove rmap before clear spte otherwise it will trigger BUG_ON() in some functions such as rmap_write_protect(). Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb48b0ca5f8c..5c9d6df0113e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1813,6 +1813,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; + rmap_remove(vcpu->kvm, sptep); spte = shadow_trap_nonpresent_pte; goto set_pte; } -- cgit From f55c3f419ab1f0a9d66f44ceeefe752975ae4233 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:08:08 +0800 Subject: KVM: MMU: unalias gfn before sp->gfns[] comparison in sync_page sp->gfns[] contain unaliased gfns, but gpte might contain pointer to aliased region. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 15e379eaf3b0..22f13797f521 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -586,7 +586,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; - gfn_t gfn = sp->gfns[i]; + gfn_t gfn; if (!is_shadow_present_pte(sp->spt[i])) continue; @@ -597,8 +597,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || - !(gpte & PT_ACCESSED_MASK)) { + gfn = gpte_to_gfn(gpte); + if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); -- cgit From 1683b2416e4c514d30ff5844a06733d0444ee000 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:09:57 +0800 Subject: KVM: x86: cleanup unused local variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kvm/x86.c: In function ‘handle_emulation_failure’: arch/x86/kvm/x86.c:3844: warning: unused variable ‘ctxt’ Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78147f0421a0..b05321adfd2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3857,8 +3857,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) static int handle_emulation_failure(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; -- cgit From 
2122ff5eab8faec853e43f6de886e8dc8f31e317 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:25:04 +0300 Subject: KVM: move vcpu locking to dispatcher for generic vcpu ioctls All vcpu ioctls need to be locked, so instead of locking each one specifically we lock at the generic dispatcher. This patch only updates generic ioctls and leaves arch specific ioctls alone. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b05321adfd2f..5acd21245fc7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4773,8 +4773,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4815,14 +4813,11 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4845,15 +4840,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4878,8 +4869,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - vcpu_put(vcpu); - return 0; } @@ -4898,8 +4887,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4931,26 +4918,20 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); return 0; } @@ -4996,8 +4977,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits; struct desc_ptr dt; - vcpu_load(vcpu); - dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -5057,8 +5036,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); - return 0; } @@ -5068,12 +5045,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; - vcpu_load(vcpu); - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - goto unlock_out; + goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else @@ 
-5115,8 +5090,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -unlock_out: - vcpu_put(vcpu); +out: return r; } @@ -5152,7 +5126,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5160,7 +5133,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - vcpu_put(vcpu); return 0; } @@ -5169,8 +5141,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -5180,8 +5150,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } @@ -5189,8 +5157,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - vcpu_load(vcpu); - memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; fxsave->swd = fpu->fsw; @@ -5200,8 +5166,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } -- cgit From 526b78ad1a9e66ef240ad7c757988de039e42229 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 11:53:06 +0300 Subject: KVM: x86: Lock arch specific vcpu ioctls centrally Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 41 ++--------------------------------------- 1 file changed, 2 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5acd21245fc7..999b017011f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1541,16 +1541,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1798,7 +1794,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1816,7 +1811,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1837,11 +1831,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); return 0; out: @@ -1854,7 +1846,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1866,7 +1857,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct 
kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -2098,9 +2088,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2108,11 +2096,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2124,20 +2110,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - vcpu_put(vcpu); - return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2157,7 +2138,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2172,7 +2152,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2230,8 +2209,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2256,8 +2233,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2268,8 +2243,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2292,22 +2265,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - vcpu_put(vcpu); - return 0; } static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - vcpu_load(vcpu); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2316,14 +2283,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - vcpu_load(vcpu); - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; - vcpu_put(vcpu); - return 0; } @@ -2335,6 +2298,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; + vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2481,9 +2445,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, 
sizeof mce)) goto out; - vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2534,6 +2496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + vcpu_put(vcpu); kfree(lapic); return r; } -- cgit From 93736624635235cc5372ffca6d62816d02170724 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 13 May 2010 12:35:17 +0300 Subject: KVM: Consolidate arch specific vcpu ioctl locking Now that all arch specific ioctls have centralized locking, it is easy to move it to the central dispatcher. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 999b017011f4..4c2096f30d90 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2298,7 +2298,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int r; struct kvm_lapic_state *lapic = NULL; - vcpu_load(vcpu); switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; @@ -2496,7 +2495,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: - vcpu_put(vcpu); kfree(lapic); return r; } -- cgit From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:22:23 +0800 Subject: x86: Export FPU API for KVM use Also add some constants. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/i387.h | 2 ++ arch/x86/include/asm/xsave.h | 3 +++ arch/x86/kernel/i387.c | 3 ++- arch/x86/kernel/process.c | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c991b3a7b904..815c5b2b9f57 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src) memcpy(dst->state, src->state, xstate_size); } +extern void fpu_finit(struct fpu *fpu); + #endif /* __ASSEMBLY__ */ #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 2c4390cae228..29ee4e4c64cf 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,6 +13,9 @@ #define FXSAVE_SIZE 512 +#define XSTATE_YMM_SIZE 256 +#define XSTATE_YMM_OFFSET (512 + 64) + /* * These are the features that the OS can handle currently. 
*/ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b32253..c4444bce8469 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -107,7 +107,7 @@ void __cpuinit fpu_init(void) } #endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..ebcfcceccc72 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit From 7cf30855e02be7a207ffebb8b9350986f2ba83e9 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:27 +0800 Subject: KVM: x86: Use unlazy_fpu() for host FPU We can avoid unnecessary fpu load when userspace process didn't use FPU frequently. Derived from Avi's idea. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 18 ++---------------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0c06148fa3b1..d93601c52902 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,6 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct host_fx_image; struct i387_fxsave_struct guest_fx_image; gva_t mmio_fault_cr2; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c2096f30d90..54ce77582eda 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -52,6 +52,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5134,21 +5135,10 @@ void fx_init(struct kvm_vcpu *vcpu) { unsigned after_mxcsr_mask; - /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. - */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); kvm_fx_finit(); kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); vcpu->arch.cr0 |= X86_CR0_ET; @@ -5165,7 +5155,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); + unlazy_fpu(current); kvm_fx_restore(&vcpu->arch.guest_fx_image); trace_kvm_fpu(1); } @@ -5177,7 +5167,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5203,9 +5192,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - /* We do fxsave: this must be aligned. 
*/ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); -- cgit From 98918833a3e21ffc5619535955e7a003cb788163 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:08:28 +0800 Subject: KVM: x86: Use FPU API Convert KVM to use generic FPU API. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 17 +------------- arch/x86/kvm/x86.c | 52 +++++++++++++---------------------------- 2 files changed, 17 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d93601c52902..d08bb4a202de 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -301,7 +301,7 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct guest_fx_image; + struct fpu guest_fpu; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -708,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void kvm_fx_finit(void) -{ - asm("finit"); -} - static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54ce77582eda..84b1788489de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -53,6 +53,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -5057,27 +5058,6 @@ out: return r; } -/* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - /* * Translate a guest virtual address to a guest physical address. 
*/ @@ -5101,7 +5081,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5117,7 +5098,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5133,22 +5115,18 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) void fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - preempt_enable(); + fpu_alloc(&vcpu->arch.guest_fpu); + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) @@ -5156,7 +5134,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } @@ -5166,7 +5144,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); trace_kvm_fpu(0); @@ -5179,6 +5157,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5213,6 +5192,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -- cgit From 1d9dc7e000915b9607b480e34fcb4238b789fbb1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:51:24 +0800 Subject: KVM: MMU: split kvm_sync_page() function Split kvm_sync_page() into kvm_sync_page() and kvm_sync_page_transient() to clarify the code address Avi's suggestion kvm_sync_page_transient() function only update shadow page but not mark it sync and not write protect sp->gfn. 
it will be used by later patch Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c9d6df0113e..ef5d140a2705 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1199,16 +1199,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - kvm_unlink_unsync_page(vcpu->kvm, sp); + if (clear_unsync) { + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + } + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; @@ -1218,6 +1222,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } +static void mmu_convert_notrap(struct kvm_mmu_page *sp); +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int ret; + + ret = __kvm_sync_page(vcpu, sp, false); + if (!ret) + mmu_convert_notrap(sp); + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + return __kvm_sync_page(vcpu, sp, true); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; -- cgit From e02aa901b1aa41fb541521800cc2a4774c162485 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:52:34 +0800 Subject: KVM: MMU: don't write-protect if have new mapping to unsync page Two cases maybe happen in kvm_mmu_get_page() function: - one case is, the goal sp is already in cache, if the sp is unsync, we only need update it to assure this mapping is valid, but not mark it sync and not write-protect sp->gfn since it not broke unsync rule(one shadow page for a gfn) - another case is, the goal sp not existed, we need create a new sp for gfn, i.e, gfn (may)has another shadow page, to keep unsync rule, we should sync(mark sync and write-protect) gfn's unsync shadow page. 
After enabling multiple unsync shadows, we sync those shadow pages only when the new sp not allow to become unsync(also for the unsyc rule, the new rule is: allow all pte page become unsync) Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef5d140a2705..064ddfbde108 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1337,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *unsync_sp = NULL; struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; @@ -1356,20 +1356,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { if (sp->unsync) - if (kvm_sync_page(vcpu, sp)) - continue; + unsync_sp = sp; if (sp->role.word != role.word) continue; + if (!direct && unsync_sp && + kvm_sync_page_transient(vcpu, unsync_sp)) { + unsync_sp = NULL; + break; + } + mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); - } + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + trace_kvm_mmu_get_page(sp, false); return sp; } + if (!direct && unsync_sp) + kvm_sync_page(vcpu, unsync_sp); + ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) -- cgit From f78978aa3a8222f7822f15fba5dbaea990ef0887 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:53:35 +0800 Subject: KVM: MMU: only update unsync page in invlpg path Only unsync pages need updated at invlpg time since other shadow pages are write-protected Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 22f13797f521..0671d7a29c3c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; gpa_t pte_gpa = -1; int level; u64 *sptep; @@ -472,10 +473,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; + sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); int offset, shift; + if (!sp->unsync) + break; + shift = PAGE_SHIFT - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; offset = sp->role.quadrant << shift; @@ -493,7 +497,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) break; } - if (!is_shadow_present_pte(*sptep)) + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } -- cgit From 9fb2d2b4ff292a01ae30da003d1dc097917b0988 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 23 May 2010 14:28:26 +0300 Subject: KVM: SVM: correctly trace irq injection On SVM interrupts are injected by svm_set_irq() not svm_inject_irq(). The later is used only to wait for irq window. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 134260c36ce2..f5c2b432078a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2917,9 +2917,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - trace_kvm_inj_virq(irq); - - ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -2933,6 +2930,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } -- cgit From 221d059d15f1c8bd070a63fd45cd8d2598af5f99 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 23 May 2010 18:37:00 +0300 Subject: KVM: Update Red Hat copyrights Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/i8254.c | 1 + arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/irq.c | 1 + arch/x86/kvm/lapic.c | 1 + arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/paging_tmpl.h | 1 + arch/x86/kvm/svm.c | 1 + arch/x86/kvm/timer.c | 14 ++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 11 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7a36eec8bab8..a4c2dcd10326 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,6 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Avi Kivity * Yaniv Kamay diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25d..188d82762c1b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,6 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338f..2c73f4493148 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a9..0f4e488331c7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,6 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9c..d8258a0060f8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,6 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affilates. 
* * Authors: * Dor Laor diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 064ddfbde108..25d3bb2543e2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0671d7a29c3c..167f53357eef 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5c2b432078a..02ea5cf4b1e5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,6 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ffe..564548fbb3d6 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -1,3 +1,17 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + #include #include #include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8aac4e98903..9c3ffc5fde44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,6 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84b1788489de..033b9c207f93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity -- cgit From 9cf5cf5ad43b293581e5b87678ea5783c06d1a41 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:40:07 +0800 Subject: KVM: MMU: allow more page become unsync at gfn mapping time In current code, shadow page can become asynchronous only if one shadow page for a gfn, this rule is too strict, in fact, we can let all last mapping page(i.e, it's the pte page) become unsync, and sync them at invlpg or flush tlb time. 
This patch allow more page become asynchronous at gfn mapping time Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25d3bb2543e2..ba119dae890e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1170,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: found role %x\n", - __func__, sp->role.word); - return sp; - } - return NULL; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); @@ -1759,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + trace_kvm_mmu_unsync_page(sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + kvm_mmu_mark_parents_unsync(sp); + mmu_convert_notrap(sp); +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - unsigned index; struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; + unsigned index; - index = kvm_page_table_hashfn(sp->gfn); + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.direct) + if (s->gfn != gfn || s->role.direct || s->unsync || + s->role.invalid) continue; - if (s->role.word != sp->role.word) - return 1; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); } - trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; - sp->unsync = 1; - - kvm_mmu_mark_parents_unsync(sp); - - mmu_convert_notrap(sp); - return 0; } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *shadow; + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + bool need_unsync = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || s->role.direct || s->role.invalid) + continue; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (shadow->unsync) - return 0; - if (can_unsync && oos_shadow) - return kvm_unsync_page(vcpu, shadow); - return 1; + + if (!need_unsync && !s->unsync) { + if (!can_unsync || !oos_shadow) + return 1; + need_unsync = true; + } } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); return 0; } -- cgit From 9f1a122f970dbef5ba3496587f39df5c1853083f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:41:33 +0800 Subject: 
KVM: MMU: allow more page become unsync at getting sp time Allow more page become asynchronous at getting sp time, if need create new shadow page for gfn but it not allow unsync(level > 1), we should unsync all gfn's unsync page Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba119dae890e..07673487fd5d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1220,6 +1220,35 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return __kvm_sync_page(vcpu, sp, true); } +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + unsigned index; + bool flush = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || !s->unsync || s->role.invalid) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s))) { + kvm_mmu_zap_page(vcpu->kvm, s); + continue; + } + kvm_unlink_unsync_page(vcpu->kvm, s); + flush = true; + } + + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1318,8 +1347,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp, *unsync_sp = NULL; + struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; + bool need_sync = false; role = vcpu->arch.mmu.base_role; role.level = level; @@ -1336,17 +1366,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { - if (sp->unsync) - unsync_sp = sp; + if (!need_sync && sp->unsync) + need_sync = true; if (sp->role.word != role.word) continue; - if (!direct && unsync_sp && - kvm_sync_page_transient(vcpu, unsync_sp)) { - unsync_sp = NULL; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - } mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { @@ -1358,9 +1385,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, false); return sp; } - if (!direct && unsync_sp) - kvm_sync_page(vcpu, unsync_sp); - ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) @@ -1371,6 +1395,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) -- cgit From c8174f7b35b3018c4c7b3237ed1c792e454fd5c3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 24 May 2010 01:01:04 +0300 Subject: KVM: VMX: Add constant for invalid guest state exit reason For the sake of completeness, this patch adds a symbolic constant for VMX exit reason 0x21 (invalid guest state). 
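As an illustrative sketch only, not part of this patch: once the constant exists, an exit-reason consumer could report the basic reason symbolically, e.g.

	/* illustrative: bits 15:0 of the VMX exit reason hold the basic reason */
	if ((exit_reason & 0xffff) == EXIT_REASON_INVALID_STATE) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason;
		return 0;
	}

The actual reporting of vmentry failures to userspace is added by a later patch in this series; the placement above is only a hypothetical usage example.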
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 9e6779f7cf2d..104cf86a7562 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -257,6 +257,7 @@ enum vmcs_field { #define EXIT_REASON_IO_INSTRUCTION 30 #define EXIT_REASON_MSR_READ 31 #define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 #define EXIT_REASON_MWAIT_INSTRUCTION 36 #define EXIT_REASON_MONITOR_INSTRUCTION 39 #define EXIT_REASON_PAUSE_INSTRUCTION 40 -- cgit From 2032a93d66fa282ba0f2ea9152eeff9511fa9a96 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:49:59 +0800 Subject: KVM: MMU: Don't allocate gfns page for direct mmu pages When sp->role.direct is set, sp->gfns does not contain any essential information, leaf sptes reachable from this sp are for a continuous guest physical memory range (a linear range). So sp->gfns[i] (if it was set) equals to sp->gfn + i. (PT_PAGE_TABLE_LEVEL) Obviously, it is not essential information, we can calculate it when need. It means we don't need sp->gfns when sp->role.direct=1, Thus we can save one page usage for every kvm_mmu_page. Note: Access to sp->gfns must be wrapped by kvm_mmu_page_get_gfn() or kvm_mmu_page_set_gfn(). It is only exposed in FNAME(sync_page). Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 +++++++++++++++++++++++++++++--------- arch/x86/kvm/paging_tmpl.h | 3 +++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07673487fd5d..f46b6c9aff27 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -397,6 +397,22 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) kmem_cache_free(rmap_desc_cache, rd); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; +} + /* * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. 
@@ -547,7 +563,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -605,6 +621,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; pfn_t pfn; + gfn_t gfn; unsigned long *rmapp; int i; @@ -616,7 +633,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -900,7 +918,8 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } @@ -911,13 +930,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) + u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -1386,7 +1407,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; @@ -3403,7 +3424,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (*sptep & PT_WRITABLE_MASK) { rev_sp = page_header(__pa(sptep)); - gfn = rev_sp->gfns[sptep - rev_sp->spt]; + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) @@ -3417,8 +3438,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 167f53357eef..2ee7060a80a5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -582,6 +582,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) offset = nr_present = 0; + /* direct kvm_mmu_page can not be unsync. 
*/ + BUG_ON(sp->role.direct); + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; -- cgit From c9fa0b3bef9a0b117b3c3f958ec553c21f609a9f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:25 +0800 Subject: KVM: MMU: Calculate correct base gfn for direct non-DIR level In Document/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in __direct_map(), the base gfn calculation is incorrect, it does not calculate correctly when level=3 or 4. Fix by using PT64_LVL_ADDR_MASK() which accounts for all levels correctly. Reported-by: Marcelo Tosatti Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f46b6c9aff27..c0350be52c91 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2020,7 +2020,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); -- cgit From 3af1817a0d65e8c1317e8d23cfe8a91aa1d4a065 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:19 +0800 Subject: KVM: MMU: calculate correct gfn for small host pages backing large guest pages In Documentation/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in function FNAME(fetch)(), sp->gfn is incorrect when one of following situations occurred: 1) guest is 32bit paging and the guest PDE maps a 4-MByte page (backed by 4k host pages), FNAME(fetch)() miss handling the quadrant. And if guest use pse-36, "table_gfn = gpte_to_gfn(gw->ptes[level - delta]);" is incorrect. 2) guest is long mode paging and the guest PDPTE maps a 1-GByte page (backed by 4k or 2M host pages). So we fix it to suit to the document and suit to the code which requires sp->gfn correct when sp->role.direct=1. We use the goal mapping gfn(gw->gfn) to calculate the base page frame for linear translations, it is simple and easy to be understood. Reported-by: Marcelo Tosatti Reported-by: Gui Jianfeng Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2ee7060a80a5..1f7f5dd8306f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,10 +339,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - delta]); - /* advance table_gfn when emulating 1gb pages with 4k */ - if (delta == 0) - table_gfn += PT_INDEX(addr, level); + /* + * It is a large guest pages backed by small host pages, + * So we set @direct(@shadow_page->role.direct)=1, and + * set @table_gfn(@shadow_page->gfn)=the base page frame + * for linear translations. 
+ */ + table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; } else { direct = 0; -- cgit From 01c168ac3d6568fed0373d82bd2db2b9339aab16 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 27 May 2010 16:09:48 +0800 Subject: KVM: MMU: don't check PT_WRITABLE_MASK directly Since we have is_writable_pte(), make use of it. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0350be52c91..9f4be0114bce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2990,7 +2990,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (is_writable_pte(pt[i])) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); @@ -3425,7 +3425,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (*sptep & PT_WRITABLE_MASK) { + if (is_writable_pte(*sptep)) { rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); @@ -3474,7 +3474,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) if (!(ent & PT_PRESENT_MASK)) continue; - if (!(ent & PT_WRITABLE_MASK)) + if (!is_writable_pte(ent)) continue; inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } @@ -3508,7 +3508,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { - if (*spte & PT_WRITABLE_MASK) + if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, -- cgit From 6dc696d4ddf2181eefee361e1d24a49351aef1f6 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Wed, 26 May 2010 15:09:43 -1000 Subject: KVM: SVM: Fix EFER.LME being stripped Must set VCPU register to be the guest notion of EFER even if that setting is not valid on hardware. This was masked by the set in set_efer until 7657fd5ace88e8092f5f3a84117e093d7b893f26 broke that. Fix is simply to set the VCPU register before stripping bits. Signed-off-by: Zachary Amsden Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 02ea5cf4b1e5..9c68a650f57e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -286,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + vcpu->arch.efer = efer; if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) -- cgit From 10ab25cd6bf7ee4e5a55d81f203f7dc1a855c27e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 25 May 2010 16:01:50 +0200 Subject: KVM: x86: Propagate fpu_alloc errors Memory allocation may fail. Propagate such errors. 
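A minimal sketch of the resulting calling convention (names taken from the diff below; illustration only):

	err = fx_init(&svm->vcpu);
	if (err)
		goto free_page4;	/* unwind pages allocated earlier in svm_create_vcpu() */

so that an allocation failure in fpu_alloc() reaches the vcpu creation path instead of being silently ignored.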
Signed-off-by: Jan Kiszka Reviewed-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 7 ++++++- arch/x86/kvm/vmx.c | 4 +++- arch/x86/kvm/x86.c | 11 +++++++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d08bb4a202de..0cd0f2923af5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -624,7 +624,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level); void kvm_inject_nmi(struct kvm_vcpu *vcpu); -void fx_init(struct kvm_vcpu *vcpu); +int fx_init(struct kvm_vcpu *vcpu); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c68a650f57e..2ae0c3923293 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -904,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - fx_init(&svm->vcpu); + err = fx_init(&svm->vcpu); + if (err) + goto free_page4; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9c3ffc5fde44..e71c731433ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,7 +2659,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - fx_init(&vmx->vcpu); + ret = fx_init(&vmx->vcpu); + if (ret != 0) + goto out; seg_setup(VCPU_SREG_CS); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 033b9c207f93..e6e0d7781af7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5114,12 +5114,19 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu) { - fpu_alloc(&vcpu->arch.guest_fpu); + int err; + + err = fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + fpu_finit(&vcpu->arch.guest_fpu); vcpu->arch.cr0 |= X86_CR0_ET; + + return 0; } EXPORT_SYMBOL_GPL(fx_init); -- cgit From 8184dd38e22fcaec664c2b98c382b85c26780e26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 27 May 2010 14:22:51 +0300 Subject: KVM: MMU: Allow spte.w=1 for gpte.w=0 and cr0.wp=0 only in shadow mode When tdp is enabled, the guest's cr0.wp shouldn't have any effect on spte permissions. 
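Roughly, the writable-spte predicate in set_spte() becomes (illustrative restatement of the hunk below; the local name is for exposition only):

	/* cr0.wp=0 may force-writable sptes only in shadow paging mode */
	bool writable = (pte_access & ACC_WRITE_MASK) ||
			(!tdp_enabled && write_fault &&
			 !is_write_protection(vcpu) && !user_fault);

With TDP the guest's cr0.wp is applied by the hardware walk of the guest page tables, so it must not leak into the spte permissions.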
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9f4be0114bce..69d40a6e1e68 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1882,7 +1882,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) + && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { -- cgit From b66d80006e415ee083e59c9429911eab78047f8f Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 31 May 2010 17:11:39 +0800 Subject: KVM: MMU: Don't calculate quadrant if tdp_enabled There's no need to calculate quadrant if tdp is enabled. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 69d40a6e1e68..d3cd102aee26 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit From 5120702e732ed72c7055f511f8dd01de36424569 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 31 May 2010 22:40:54 +0300 Subject: KVM: VMX: Properly return error to userspace on vmentry failure The vmexit handler returns KVM_EXIT_UNKNOWN since there is no handler for vmentry failures. This intercepts vmentry failures and returns KVM_FAIL_ENTRY to userspace instead. Signed-off-by: Mohammed Gamal Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e71c731433ee..8c9085d44c37 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3665,6 +3665,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + if (unlikely(vmx->fail)) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason -- cgit From 4bc9b9828150747386130ab172f7e868e1a0fc2a Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 2 Jun 2010 14:05:24 +0800 Subject: KVM: VMX: Enforce EPT pagetable level checking We only support 4 levels EPT pagetable now. 
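A sketch of the enforcement in hardware_setup() (mirrors the hunk below; illustration only):

	if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels()) {
		enable_ept = 0;			/* fall back to shadow paging */
		enable_unrestricted_guest = 0;	/* unrestricted guest requires EPT */
	}

where cpu_has_vmx_ept_4levels() tests the 4-level page-walk bit reported in the EPT/VPID capability MSR.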
Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8c9085d44c37..2201e3816205 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -340,6 +340,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -1568,7 +1573,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) { + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; } -- cgit From 7bee342a9e994cce7122cb187b4f3ded9d871165 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 2 Jun 2010 17:06:03 +0800 Subject: KVM: x86: use linux/uaccess.h instead of asm/uaccess.h Should use linux/uaccess.h instead of asm/uaccess.h Signed-off-by: Lai Jiangshan Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6e0d7781af7..b08c0052e332 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,13 +42,13 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS #include "trace.h" #include -#include #include #include #include -- cgit From 518c8aee5ca74fc03273fc6b4893cf456d65d545 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Fri, 4 Jun 2010 08:51:39 +0800 Subject: KVM: VMX: Make sure single type invvpid is supported before issuing invvpid instruction According to SDM, we need check whether single-context INVVPID type is supported before issuing invvpid instruction. 
Signed-off-by: Gui Jianfeng Reviewed-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 104cf86a7562..b4e28400c9ff 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -376,6 +376,8 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2201e3816205..945265361885 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -360,6 +360,11 @@ static inline bool cpu_has_vmx_invept_global(void) return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -504,7 +509,8 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) if (vmx->vpid == 0) return; - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } static inline void ept_sync_global(void) -- cgit From 03116aa57e75b1bbe8b5e04f3cd21cdb6588c4ba Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:52:17 +0800 Subject: KVM: MMU: skip invalid sp when unprotect page In kvm_mmu_unprotect_page(), the invalid sp can be skipped Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cd102aee26..3ac51153bc47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1629,7 +1629,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) bucket = &kvm->arch.mmu_page_hash[index]; restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct) { + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; -- cgit From 7ae680eb2d5f0cb10ca0e6d1ff5ecb145befe8e4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:07 +0800 Subject: KVM: MMU: introduce some macros to cleanup hlist traverseing Introduce for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() to cleanup hlist traverseing Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 122 +++++++++++++++++++++-------------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ac51153bc47..881ad918455c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1201,6 +1201,17 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + 
&(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) { @@ -1244,16 +1255,12 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) /* @gfn should be write-protected at the call site */ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; bool flush = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || !s->unsync || s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1365,9 +1372,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; - unsigned index; unsigned quadrant; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; bool need_sync = false; @@ -1383,36 +1388,34 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) - if (sp->gfn == gfn) { - if (!need_sync && sp->unsync) - need_sync = true; + for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + if (!need_sync && sp->unsync) + need_sync = true; - if (sp->role.word != role.word) - continue; + if (sp->role.word != role.word) + continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); - trace_kvm_mmu_get_page(sp, false); - return sp; - } + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, bucket); + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -1617,46 +1620,34 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *n; int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + 
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; + } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *nn; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: zap %lx %x\n", - __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; } } @@ -1799,17 +1790,11 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; - - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->unsync || - s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); __kvm_unsync_page(vcpu, s); @@ -1819,18 +1804,11 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; bool need_unsync = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->role.invalid) - continue; - + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2703,8 +2681,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2772,13 +2748,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) - continue; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { pte_size = sp->role.cr4_pae ? 
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; -- cgit From 7775834a233478ec855b97e30727248f12eafe76 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:54 +0800 Subject: KVM: MMU: split the operations of kvm_mmu_zap_page() Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to split kvm_mmu_zap_page() function, then we can: - traverse hlist safely - easily to gather remote tlb flush which occurs during page zapped Those feature can be used in the later patches Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 52 ++++++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/mmutrace.h | 2 +- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 881ad918455c..9b849a70742d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -916,6 +916,7 @@ static int is_empty_shadow_page(u64 *spt) static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); if (!sp->role.direct) @@ -1200,6 +1201,10 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); #define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ hlist_for_each_entry_safe(sp, pos, n, \ @@ -1530,7 +1535,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent) + struct kvm_mmu_page *parent, + struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; @@ -1544,7 +1550,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_zap_page(kvm, sp); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } @@ -1554,16 +1560,16 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { int ret; - trace_kvm_mmu_zap_page(sp); + trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp); + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); - kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) @@ -1571,17 +1577,45 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->root_count) { /* Count self */ ret++; - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); + list_move(&sp->link, invalid_list); } else { - sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } + + sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + + do { 
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + return ret; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc9..3aab0f0930ef 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp) -- cgit From 103ad25a86a6ec5418b3dca6a0d2bf2ba01a8318 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:54:38 +0800 Subject: KVM: MMU: don't get free page number in the loop In a later patch, we will modify the way sps are zapped, like below: kvm_mmu_prepare_zap_page A kvm_mmu_prepare_zap_page B kvm_mmu_prepare_zap_page C .... kvm_mmu_commit_zap_page [ zapping multiple sps only needs to call kvm_mmu_commit_zap_page once ] In the __kvm_mmu_free_some_pages() function, the free page count is read from 'vcpu->kvm->arch.n_free_mmu_pages' inside the loop, which hinders us from applying kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page(), since kvm_mmu_prepare_zap_page() does not free the sp. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b849a70742d..1aad8e713f78 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2863,13 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + int free_pages; + + free_pages = vcpu->kvm->arch.n_free_mmu_pages; + while (free_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } } -- cgit From d98ba053656c033180781007241f2c9d54606d56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:55:29 +0800 Subject: KVM: MMU: gather remote tlb flush which occurs during page zapped Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() instead of kvm_mmu_zap_page() can reduce remote tlb flush IPIs Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1aad8e713f78..44548e346976 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,6 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list); static void
kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -1218,10 +1217,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, (sp)->role.invalid) {} else static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - bool clear_unsync) + struct list_head *invalid_list, bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1232,7 +1231,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1244,17 +1243,22 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + LIST_HEAD(invalid_list); int ret; - ret = __kvm_sync_page(vcpu, sp, false); + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); if (!ret) mmu_convert_notrap(sp); + else + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + return ret; } -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, true); + return __kvm_sync_page(vcpu, sp, invalid_list, true); } /* @gfn should be write-protected at the call site */ @@ -1262,6 +1266,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); bool flush = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { @@ -1271,13 +1276,14 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_zap_page(vcpu->kvm, s); + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } kvm_unlink_unsync_page(vcpu->kvm, s); flush = true; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) kvm_mmu_flush_tlb(vcpu); } @@ -1348,6 +1354,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1360,9 +1367,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp); + kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1606,16 +1614,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); - return ret; -} - /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1623,6 +1621,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; + LIST_HEAD(invalid_list); used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = 
max(0, used_pages); @@ -1640,8 +1639,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } @@ -1656,6 +1657,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); @@ -1665,9 +1667,10 @@ restart: pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; } @@ -1675,14 +1678,16 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *nn; + LIST_HEAD(invalid_list); restart: for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -2123,6 +2128,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -2132,8 +2138,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2146,10 +2154,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2715,6 +2725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2801,7 +2812,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_zap_page(vcpu->kvm, sp)) + if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list)) goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2836,6 +2848,7 @@ restart: ++spte; } } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2864,6 +2877,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { int free_pages; + LIST_HEAD(invalid_list); free_pages = vcpu->kvm->arch.n_free_mmu_pages; while (free_pages < KVM_REFILL_PAGES && @@ -2872,9 +2886,11 @@ void 
__kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3009,25 +3025,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page); + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -3040,6 +3059,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages, idx, freed_pages; + LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -3047,12 +3067,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit From f41d335a02d5132c14ec0459d3b2790eeb16fb11 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:11 +0800 Subject: KVM: MMU: traverse sp hlist safely Now we can safely traverse the sp hlist Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44548e346976..3b75689eda95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1205,13 +1205,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1265,11 +1265,11
@@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (!s->unsync) continue; @@ -1387,7 +1387,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node, *tmp; + struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1401,7 +1401,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { if (!need_sync && sp->unsync) need_sync = true; @@ -1656,19 +1656,18 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; @@ -1677,15 +1676,13 @@ restart: static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *nn; + struct hlist_node *node; LIST_HEAD(invalid_list); -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -1830,9 +1827,9 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1844,10 +1841,10 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2724,7 +2721,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; @@ -2794,8 +2791,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } -restart: - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { + 
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; @@ -2812,9 +2808,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } -- cgit From 0671a8e75d8aeb33e15c5152147abb0d2fa0c1e6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:59 +0800 Subject: KVM: MMU: reduce remote tlb flush in kvm_mmu_pte_write() collect remote tlb flush in kvm_mmu_pte_write() path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b75689eda95..b285449e82b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2666,11 +2666,15 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) { - if (need_remote_flush(old, new)) + if (zap_page) + return; + + if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); - else + else if (local_flush) kvm_mmu_flush_tlb(vcpu); } @@ -2735,6 +2739,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; int r; int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2808,7 +2815,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2833,16 +2840,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != sp->role.quadrant) continue; } + local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); if (gentry) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; ++spte; } } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit From b9d762fa79f541ab480cdb733b46fdb0b4471c2d Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:32:29 +0800 Subject: KVM: VMX: Add all-context INVVPID type support Add all-context INVVPID type support. 
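For orientation, the capability check and fallback that this patch wires up in the vmx.c changes below reduce to the following simplified, user-space sketch; the helper names, printf output and main() scaffolding are illustrative stand-ins, not the kernel's code:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins mirroring the VMX_VPID_EXTENT_* capability bits. */
#define VPID_EXT_SINGLE_CONTEXT_BIT (1ull << 9)   /* MSR bit 41 - 32 */
#define VPID_EXT_ALL_CONTEXT_BIT    (1ull << 10)  /* MSR bit 42 - 32 */

/* Would be read from the IA32_VMX_EPT_VPID_CAP MSR on real hardware. */
static unsigned long long vpid_caps;

static bool has_invvpid_single(void) { return vpid_caps & VPID_EXT_SINGLE_CONTEXT_BIT; }
static bool has_invvpid_global(void) { return vpid_caps & VPID_EXT_ALL_CONTEXT_BIT; }

/* Stand-in for the __invvpid() instruction wrapper. */
static void invvpid(const char *type, int vpid)
{
	printf("INVVPID %s, vpid=%d\n", type, vpid);
}

/* Prefer flushing only this guest's mappings; fall back to flushing every vpid. */
static void vpid_sync_context(int vpid)
{
	if (has_invvpid_single())
		invvpid("single-context", vpid);
	else if (has_invvpid_global())
		invvpid("all-context", 0);
}

int main(void)
{
	vpid_caps = VPID_EXT_ALL_CONTEXT_BIT;	/* pretend only all-context INVVPID exists */
	vpid_sync_context(3);
	return 0;
}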
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index b4e28400c9ff..96a5886d384e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -377,6 +377,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ +#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 945265361885..622d83b0caf3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -365,6 +365,11 @@ static inline bool cpu_has_vmx_invvpid_single(void) return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -513,6 +518,20 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); } +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_all(vmx); + else + vpid_sync_vcpu_global(); +} + static inline void ept_sync_global(void) { if (cpu_has_vmx_invept_global()) @@ -1800,7 +1819,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vpid_sync_vcpu_all(to_vmx(vcpu)); + vpid_sync_context(to_vmx(vcpu)); if (enable_ept) ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } @@ -2756,7 +2775,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - vpid_sync_vcpu_all(vmx); + vpid_sync_context(vmx); ret = 0; -- cgit From 1760dd4939a62591e492971858fac8cce1e4539e Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 7 Jun 2010 10:33:27 +0800 Subject: KVM: VMX: rename vpid_sync_vcpu_all() to vpid_sync_vcpu_single() The name "vpid_sync_vcpu_all" isn't appropriate since it only affects a single vpid, so rename it to vpid_sync_vcpu_single().
Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 622d83b0caf3..7d7361750bf2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -509,7 +509,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); } -static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) { if (vmx->vpid == 0) return; @@ -527,7 +527,7 @@ static inline void vpid_sync_vcpu_global(void) static inline void vpid_sync_context(struct vcpu_vmx *vmx) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_all(vmx); + vpid_sync_vcpu_single(vmx); else vpid_sync_vcpu_global(); } -- cgit From 4b9d3a04519fb508ad3b7ce8a7962929b2614185 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 10:15:51 +0800 Subject: KVM: VMX: fix rcu usage warning in init_rmode() fix: [ INFO: suspicious rcu_dereference_check() usage. ] --------------------------------------------------- include/linux/kvm_host.h:258 invoked rcu_dereference_check() without protection! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 1 1 lock held by qemu-system-x86/3796: #0: (&vcpu->mutex){+.+.+.}, at: [] vcpu_load+0x1a/0x66 [kvm] stack backtrace: Pid: 3796, comm: qemu-system-x86 Not tainted 2.6.34 #25 Call Trace: [] lockdep_rcu_dereference+0x9d/0xa5 [] gfn_to_memslot_unaliased+0x65/0xa0 [kvm] [] gfn_to_hva+0x22/0x4c [kvm] [] kvm_write_guest_page+0x2a/0x7f [kvm] [] kvm_clear_guest_page+0x1a/0x1c [kvm] [] init_rmode+0x3b/0x180 [kvm_intel] [] vmx_set_cr0+0x350/0x4d3 [kvm_intel] [] kvm_arch_vcpu_ioctl_set_sregs+0x122/0x31a [kvm] [] kvm_vcpu_ioctl+0x578/0xa3d [kvm] [] ? cpu_clock+0x2d/0x40 [] ? fget_light+0x244/0x28e [] ? trace_hardirqs_off_caller+0x1f/0x10e [] vfs_ioctl+0x32/0xa6 [] do_vfs_ioctl+0x47f/0x4b8 [] ? sub_preempt_count+0xa3/0xb7 [] ? fget_light+0x266/0x28e [] ? 
fget_light+0x111/0x28e [] sys_ioctl+0x47/0x6a [] system_call_fastpath+0x16/0x1b Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7d7361750bf2..01b054c98139 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,21 +2659,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static int init_rmode(struct kvm *kvm) { + int idx, ret = 0; + + idx = srcu_read_lock(&kvm->srcu); if (!init_rmode_tss(kvm)) - return 0; + goto exit; if (!init_rmode_identity_map(kvm)) - return 0; - return 1; + goto exit; + + ret = 1; +exit: + srcu_read_unlock(&kvm->srcu, idx); + return ret; } static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret, idx; + int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2783,7 +2789,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } -- cgit From 4f78fd08e91c52f097d64a42d903b76fe52a3a0f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:05 +0800 Subject: KVM: MMU: remove unnecessary remote tlb flush This remote tlb flush is not necessary since we have already synced while the sp was zapped Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b285449e82b0..098a0b8616b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3040,8 +3040,6 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); } static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, -- cgit From 5304efde6ae27deeeae79b97af709d4ceecc336e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:57 +0800 Subject: KVM: MMU: use wrapper function to flush local tlb Use the kvm_mmu_flush_tlb() function instead of calling kvm_x86_ops->tlb_flush(vcpu) directly.
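The payoff shows up in the following patch, which turns this single choke point into a deferred request. A simplified, user-space model of that flow (the struct, bit name and helpers are stand-ins, not the kernel's):

#include <stdio.h>

#define REQ_TLB_FLUSH 0

/* Stand-in for the vcpu and its request bitmask. */
struct vcpu {
	unsigned long requests;
	unsigned long stat_tlb_flush;
};

/* Every MMU path funnels through this wrapper instead of flushing directly. */
static void mmu_flush_tlb(struct vcpu *v)
{
	++v->stat_tlb_flush;
	v->requests |= 1ul << REQ_TLB_FLUSH;	/* record the request, do not flush yet */
}

/* The pending request is honoured exactly once, right before guest entry. */
static void enter_guest(struct vcpu *v)
{
	if (v->requests & (1ul << REQ_TLB_FLUSH)) {
		v->requests &= ~(1ul << REQ_TLB_FLUSH);
		printf("flush local tlb once before VM entry\n");
	}
	printf("vmentry, tlb_flush stat = %lu\n", v->stat_tlb_flush);
}

int main(void)
{
	struct vcpu v = { 0, 0 };

	mmu_flush_tlb(&v);
	mmu_flush_tlb(&v);	/* repeated requests collapse into a single flush */
	enter_guest(&v);
	return 0;
}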
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 098a0b8616b0..e087f855461d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1984,7 +1984,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_x86_ops->tlb_flush(vcpu); + kvm_mmu_flush_tlb(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); -- cgit From 3b5d13218667b3ca52efa52cec1d322163bf5465 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:07:01 +0800 Subject: KVM: MMU: delay local tlb flush Delay the local tlb flush until we enter guest mode; this can reduce the vpid flush frequency and reduce remote tlb flush IPIs (if the KVM_REQ_TLB_FLUSH bit is already set, no IPI is sent) Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e087f855461d..4706a936e36f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2364,7 +2364,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); } static void paging_new_cr3(struct kvm_vcpu *vcpu) -- cgit From a24e809902339458416900869abdcc51a44bfd48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 10 Jun 2010 13:10:55 +0200 Subject: KVM: Fix unused but set warnings No real bugs in this one. Signed-off-by: Andi Kleen Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1f7f5dd8306f..9308be2d5c02 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -444,6 +444,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); + (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); -- cgit From f495c6e5e8fdc972162241df5bdff5bcebb4dc33 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:21:29 +0300 Subject: KVM: VMX: Fix incorrect rcu deref in rmode_tss_base() Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 01b054c98139..26ba61d6af8c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1688,7 +1688,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = kvm->memslots->memslots[0].base_gfn + + base_gfn = slots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } -- cgit From 2acf923e38fb6a4ce0c57115decbb38d334902ac Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 10 Jun 2010 11:27:12 +0800 Subject: KVM: VMX: Enable XSAVE/XRSTOR for guest This patch enables the guest to use the XSAVE/XRSTOR instructions. We assume that host_xcr0 uses all possible bits that the OS supports. We load xcr0 the same way we handle the fpu - do it as late as we can.
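The acceptance rules that the new __kvm_set_xcr() applies to a guest-written xcr0 (see the x86.c changes below) amount to the checks sketched here; the XFEAT_* names and the user-space wrapper are illustrative stand-ins, not the kernel's definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for XSTATE_FP, XSTATE_SSE and XSTATE_YMM. */
#define XFEAT_FP  (1ull << 0)
#define XFEAT_SSE (1ull << 1)
#define XFEAT_YMM (1ull << 2)

/* Assumed to hold every bit the host OS enabled via XSETBV. */
static uint64_t host_xcr0 = XFEAT_FP | XFEAT_SSE | XFEAT_YMM;

static bool guest_xcr0_valid(uint64_t xcr0)
{
	if (!(xcr0 & XFEAT_FP))
		return false;		/* x87 state can never be disabled */
	if ((xcr0 & XFEAT_YMM) && !(xcr0 & XFEAT_SSE))
		return false;		/* YMM state requires SSE state */
	if (xcr0 & ~host_xcr0)
		return false;		/* the guest cannot exceed the host's mask */
	return true;
}

int main(void)
{
	printf("%d\n", guest_xcr0_valid(XFEAT_FP | XFEAT_SSE | XFEAT_YMM));	/* 1 */
	printf("%d\n", guest_xcr0_valid(XFEAT_YMM));				/* 0 */
	return 0;
}

A value that fails these checks makes kvm_set_xcr() inject #GP into the guest instead of updating xcr0.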
Signed-off-by: Dexuan Cui Signed-off-by: Sheng Yang Reviewed-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/kvm_cache_regs.h | 6 ++ arch/x86/kvm/vmx.c | 13 ++++ arch/x86/kvm/x86.c | 130 +++++++++++++++++++++++++++++++++++++--- 5 files changed, 145 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0cd0f2923af5..91631b8b2090 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -302,6 +302,7 @@ struct kvm_vcpu_arch { } update_pte; struct fpu guest_fpu; + u64 xcr0; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 96a5886d384e..9f0cbd987d50 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -267,6 +267,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 /* * Interruption-information format diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index d2a98f8f9af5..6491ac8e755b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) return kvm_read_cr4_bits(vcpu, ~0UL); } +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 26ba61d6af8c..864a1b6d155a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include "trace.h" @@ -3390,6 +3392,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu) return 1; } +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_apic_access(struct kvm_vcpu *vcpu) { return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; @@ -3668,6 +3680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b08c0052e332..b5e644701cc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -65,6 +65,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -150,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -474,6 +482,61 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + u64 xcr0; + + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) + return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); + } +} + int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); @@ -482,6 +545,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (cr4 & CR4_RESERVED_BITS) return 1; + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -498,6 +564,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & pdptr_bits) kvm_mmu_reset_context(vcpu); + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + return 0; } @@ -666,11 +735,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) } EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 
@@ -1814,6 +1878,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -1837,6 +1902,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); return 0; out: @@ -1917,7 +1983,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | @@ -1932,7 +1998,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; @@ -1990,6 +2056,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; @@ -4125,6 +4205,9 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return 0; out: @@ -4523,6 +4606,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -4568,6 +4670,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); atomic_set(&vcpu->guest_mode, 1); smp_wmb(); @@ -5124,6 +5227,11 @@ int fx_init(struct kvm_vcpu *vcpu) fpu_finit(&vcpu->arch.guest_fpu); + /* + * Ensure guest xcr0 is valid for loading + */ + vcpu->arch.xcr0 = XSTATE_FP; + vcpu->arch.cr0 |= X86_CR0_ET; return 0; @@ -5140,6 +5248,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) if (vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. 
+ */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; unlazy_fpu(current); fpu_restore_checking(&vcpu->arch.guest_fpu); @@ -5148,6 +5262,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; -- cgit From 49a9b07edcf4aff159c1f3d3a27e58cf38bc27cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:14 +0300 Subject: KVM: Fix mov cr0 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 13 +++++++++++-- arch/x86/kvm/x86.c | 12 +++--------- 4 files changed, 16 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 91631b8b2090..b23708450210 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -597,7 +597,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2ae0c3923293..6d1616d47c54 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -807,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm) * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); save->cr4 = X86_CR4_PAE; /* rdx = ?? 
*/ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 864a1b6d155a..1baf4b2d98ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3157,11 +3157,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; + int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -3172,8 +3181,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr0(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 3: kvm_set_cr3(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b5e644701cc1..05e9b5dde646 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -425,7 +425,7 @@ out: return changed; } -static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | @@ -468,17 +468,11 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_mmu_reset_context(vcpu); return 0; } - -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) -{ - if (__kvm_set_cr0(vcpu, cr0)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -3732,7 +3726,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) switch (cr) { case 0: - res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; -- cgit From a83b29c6ad6d6497e569edbc29e556a384cebddd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:15 +0300 Subject: KVM: Fix mov cr4 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
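As with cr0, the cr4 exit path now routes the outcome through complete_insn_gp(), so exactly one of the two actions happens: either the fault is injected with RIP still on the MOV, or the instruction is skipped. A minimal model of that helper (the set_cr4() validity check below is a toy stand-in, not the real CR4 checks):

#include <stdio.h>

static void inject_gp(void)        { printf("inject #GP, RIP stays on the MOV\n"); }
static void skip_instruction(void) { printf("advance RIP past the MOV\n"); }

/* Either report the fault or complete the instruction - never both. */
static void complete_insn_gp(int err)
{
	if (err)
		inject_gp();
	else
		skip_instruction();
}

/* Toy stand-in for kvm_set_cr4(): nonzero means the write was rejected. */
static int set_cr4(unsigned long val)
{
	return (val & (1ul << 3)) ? 1 : 0;	/* pretend bit 3 is reserved */
}

int main(void)
{
	complete_insn_gp(set_cr4(0x0));		/* accepted: skip the instruction */
	complete_insn_gp(set_cr4(0x8));		/* rejected: #GP at the MOV itself */
	return 0;
}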
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b23708450210..ea8c319cdffc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -599,7 +599,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1baf4b2d98ee..f64d65dc38c6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3189,8 +3189,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr4(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05e9b5dde646..ed3af15d4404 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -531,7 +531,7 @@ static void update_cpuid(struct kvm_vcpu *vcpu) } } -int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; @@ -563,12 +563,6 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 0; } - -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (__kvm_set_cr4(vcpu, cr4)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr4); static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) @@ -3735,7 +3729,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) res = __kvm_set_cr3(vcpu, val); break; case 4: - res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: res = __kvm_set_cr8(vcpu, val & 0xfUL); -- cgit From 2390218b6aa2eb3784b0a82fa811c19097dc793a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:16 +0300 Subject: KVM: Fix mov cr3 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. 
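The cr3 variant follows the same shape: kvm_set_cr3() now returns an error code instead of injecting the fault itself, so the exit handler decides what to do, while the reset and nested-state callers deliberately ignore the result - hence the (void) casts in the mmu.c and svm.c hunks below. A small sketch of that calling convention (the alignment check is a stand-in, not the real cr3 validation):

#include <stdio.h>

/* Returns nonzero when the new cr3 would be rejected; no fault is injected here. */
static int set_cr3(unsigned long cr3)
{
	return (cr3 & 0xfffUL) ? 1 : 0;		/* toy check: require page alignment */
}

int main(void)
{
	/* Exit-handler path: the caller turns the result into #GP or a RIP advance. */
	if (set_cr3(0x1234))
		printf("inject #GP\n");
	else
		printf("skip emulated instruction\n");

	/* Reset/vmexit paths know the value is fine and drop the result on purpose. */
	(void)set_cr3(0x2000);
	return 0;
}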
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++-------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ea8c319cdffc..c2813d658f3e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -598,7 +598,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4706a936e36f..aa98fca03ed7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3203,7 +3203,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6d1616d47c54..f7a6fdcf8ef3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1963,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2086,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; } else - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); /* Guest paging mode is active - reset mmu */ kvm_mmu_reset_context(&svm->vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f64d65dc38c6..345a35470511 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3185,8 +3185,8 @@ static int handle_cr(struct kvm_vcpu *vcpu) complete_insn_gp(vcpu, err); return 1; case 3: - kvm_set_cr3(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr3(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 4: err = kvm_set_cr4(vcpu, val); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ed3af15d4404..795999e1ac19 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -565,7 +565,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } EXPORT_SYMBOL_GPL(kvm_set_cr4); -static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); @@ -604,12 +604,6 @@ static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.mmu.new_cr3(vcpu); return 0; } - -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) -{ - if (__kvm_set_cr3(vcpu, cr3)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr3); int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) @@ -3726,7 +3720,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct 
kvm_vcpu *vcpu) vcpu->arch.cr2 = val; break; case 3: - res = __kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); -- cgit From 2d5b5a665508c60577c1088e0405850a965b6795 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Sun, 13 Jun 2010 17:29:39 +0800 Subject: KVM: x86: XSAVE/XRSTOR live migration support This patch enable save/restore of xsave state. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm.h | 22 +++++++ arch/x86/include/asm/xsave.h | 7 ++- arch/x86/kvm/x86.c | 139 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index ff90055c7f0b..4d8dcbdfc120 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -22,6 +22,8 @@ #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS #define __KVM_HAVE_DEBUGREGS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -299,4 +301,24 @@ struct kvm_debugregs { __u64 reserved[9]; }; +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 29ee4e4c64cf..32c36668fa7b 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -13,8 +13,11 @@ #define FXSAVE_SIZE 512 -#define XSTATE_YMM_SIZE 256 -#define XSTATE_YMM_OFFSET (512 + 64) +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) /* * These are the features that the OS can handle currently. 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 795999e1ac19..0c8dc9614e7d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1680,6 +1680,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1703,6 +1704,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; default: r = 0; break; @@ -2355,6 +2359,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + sizeof(struct xsave_struct)); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, sizeof(struct xsave_struct)); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2556,6 +2631,70 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } + case KVM_GET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + + r = -EFAULT; + if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + struct kvm_xsave *xsave; + + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + r = -EFAULT; + if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + break; + } + case KVM_GET_XCRS: { + struct kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + + r = -EFAULT; + if (copy_to_user(argp, xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + struct 
kvm_xcrs *xcrs; + + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + r = -EFAULT; + if (copy_from_user(xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + break; + } default: r = -EINVAL; } -- cgit From ac3cd03cca91d481b41e8236aaa41a7f9fafa62f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:28:14 +0800 Subject: KVM: MMU: rename 'page' and 'shadow_page' to 'sp' Rename 'page' and 'shadow_page' to 'sp' to better fit the context Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9308be2d5c02..e461f2393d85 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ err: return 0; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { pt_element_t gpte; @@ -264,7 +264,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { if (!is_present_gpte(gpte)) { - if (page->unsync) + if (sp->unsync) new_spte = shadow_trap_nonpresent_pte; else new_spte = shadow_notrap_nonpresent_pte; @@ -273,7 +273,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; pfn = vcpu->arch.update_pte.pfn; @@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * we call mmu_set_spte() with reset_host_protection = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -300,7 +300,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *shadow_page; + struct kvm_mmu_page *sp; u64 spte, *sptep = NULL; int direct; gfn_t table_gfn; @@ -341,9 +341,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, access &= ~ACC_WRITE_MASK; /* * It is a large guest pages backed by small host pages, - * So we set @direct(@shadow_page->role.direct)=1, and - * set @table_gfn(@shadow_page->gfn)=the base page frame - * for linear translations. + * So we set @direct(@sp->role.direct)=1, and set + * @table_gfn(@sp->gfn)=the base page frame for linear + * translations. 
*/ table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); access &= gw->pte_access; @@ -351,21 +351,21 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 0; table_gfn = gw->table_gfn[level - 2]; } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); if (!direct) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } } - spte = __pa(shadow_page->spt) + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; *sptep = spte; -- cgit From cb83cad2e7e1cdedb2abb9cef2ac076defa679d4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:29:42 +0800 Subject: KVM: MMU: cleanup for dirty page judgment Using wrap function to cleanup page dirty judgment Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e461f2393d85..efba353369e7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -287,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } @@ -319,7 +319,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, + is_dirty_gpte(gw->ptes[gw->level-1]), ptwrite, level, gw->gfn, pfn, false, true); break; -- cgit From f918b443527e98476c8cc45683152106b9e4bedc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:30:36 +0800 Subject: KVM: MMU: avoid double write protected in sync page path The sync page is already write protected in mmu_sync_children(), don't write protected it again Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa98fca03ed7..ff333572be75 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,6 +1216,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else +/* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list, bool clear_unsync) { @@ -1224,11 +1225,8 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - if (clear_unsync) { - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); -- cgit From be71e061d15c0aad4f8c2606f76c57b8a19792fd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:31:38 +0800 Subject: KVM: MMU: don't mark pte notrap if it's just 
sync transient If the sync-sp just sync transient, don't mark its pte notrap Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 11 ++++------- arch/x86/kvm/paging_tmpl.h | 5 +++-- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2813d658f3e..2ec2e27a403e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -241,7 +241,7 @@ struct kvm_mmu { void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp); + struct kvm_mmu_page *sp, bool clear_unsync); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); hpa_t root_hpa; int root_level; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff333572be75..d1e09f3c5614 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1103,7 +1103,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool clear_unsync) { return 1; } @@ -1228,7 +1228,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1237,7 +1237,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 0; } -static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1245,9 +1244,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (!ret) - mmu_convert_notrap(sp); - else + if (ret) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); return ret; @@ -1273,7 +1270,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index efba353369e7..863920f649fb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -578,7 +578,8 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * can't change unless all sptes pointing to it are nuked first. * - Alias changes zap the entire shadow cache. 
*/ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { int i, offset, nr_present; bool reset_host_protection; @@ -615,7 +616,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) u64 nonpresent; rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_gpte(gpte)) + if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; -- cgit From ebdea638df04ae6293a9a5414d98ad843c69e82f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:32:34 +0800 Subject: KVM: MMU: cleanup for __mmu_unsync_walk() Decrease sp->unsync_children after clearing a bit in unsync_child_bitmap. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1e09f3c5614..41e801b53064 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1160,9 +1160,11 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) + if (!ret) { __clear_bit(i, sp->unsync_child_bitmap); - else if (ret > 0) + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); + } else if (ret > 0) nr_unsync_leaf += ret; else return ret; @@ -1176,8 +1178,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) - sp->unsync_children = 0; return nr_unsync_leaf; } -- cgit From 7a8f1a74e4193d21e55b35928197486f2c047efb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:34:04 +0800 Subject: KVM: MMU: clear unsync_child_bitmap completely In the current code, a page's unsync_child_bitmap is not always cleared completely in mmu_sync_children(); for example, if two PDPEs share one PDT, one PDPE's unsync_child_bitmap is not cleared.
Currently this does not harm anything beyond a little extra overhead, but it is preparatory work for a later patch. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 41e801b53064..ab12be4eb105 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1149,33 +1149,38 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) { - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); - } else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; - if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } - } + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); } -- cgit From 1047df1fb682a41eb9885d6b3f2d04d6c8fd3756 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:35:15 +0800 Subject: KVM: MMU: don't walk every parent pages while mark unsync While marking the parent's unsync_child_bitmap, if the parent is already unsynced there is no need to walk its own parents; this avoids some unnecessary work. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 61 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ab12be4eb105..8c2f580956d9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,7 +175,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -1024,7 +1024,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } - static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; @@ -1034,63 +1033,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + fn(parent_sp, sp->parent_pte); return; } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i < NR_PTE_CHAIN_ENTRIES;
++i) { - if (!pte_chain->parent_ptes[i]) + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) break; - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); } } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - index = spte - sp->spt; - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) - sp->unsync_children++; - WARN_ON(!sp->unsync_children); + mmu_parent_walk(sp, mark_unsync); } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; + unsigned int index; - if (!sp->parent_pte) + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; - - if (!sp->multimapped) { - kvm_mmu_update_unsync_bitmap(sp->parent_pte); + if (sp->unsync_children++) return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); - } -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ - kvm_mmu_update_parents_unsync(sp); - return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - mmu_parent_walk(sp, unsync_walk_fn); - kvm_mmu_update_parents_unsync(sp); + kvm_mmu_mark_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, -- cgit From bd371396b38ffc4bd6444b0203f33b99d18cedd0 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Mon, 14 Jun 2010 11:42:15 -1000 Subject: KVM: x86: fix -DDEBUG oops Fix a slight error with an assertion in the local APIC code. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d8258a0060f8..024f6d1c2996 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -329,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); - ASSERT(!target); + ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == 0) -- cgit From c37eda138473f8c843f2b4aa8da252fdfdaaafa3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 15 Jun 2010 09:03:33 +0800 Subject: KVM: x86 emulator: fix pusha instruction emulation The pusha instruction emulation only wrote back the last register (EDI); the other registers that also need to be written back were ignored. This patch fixes it.
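The essence of the fix, reconstructed from the hunk below with the diff markers stripped: flush each push through writeback() before the decode cache is reused for the next register, then disable the common writeback at the end:

	while (reg <= VCPU_REGS_RDI) {
		(reg == VCPU_REGS_RSP) ?
		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);

		emulate_push(ctxt, ops);

		rc = writeback(ctxt, ops);	/* commit this push now */
		if (rc != X86EMUL_CONTINUE)
			return rc;

		++reg;
	}

	c->dst.type = OP_NONE;	/* everything has already been written back */

This is also why writeback() has to move above emulate_pusha() in the file.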
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 133 +++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4c2dcd10326..c990db0a3a02 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1553,6 +1553,64 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return X86EMUL_CONTINUE; +} + static inline void emulate_push(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1651,11 +1709,12 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt, +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { @@ -1663,8 +1722,18 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt, (c->src.val = old_esp) : (c->src.val = c->regs[reg]); emulate_push(ctxt, ops); + + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + ++reg; } + + /* Disable writeback. */ + c->dst.type = OP_NONE; + + return rc; } static int emulate_popa(struct x86_emulate_ctxt *ctxt, @@ -1817,64 +1886,6 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - u32 err; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return X86EMUL_CONTINUE; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2689,7 +2700,9 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt, ops); + rc = emulate_pusha(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); -- cgit From 33572ac0ad5ba5016da72e6654e607726568f9c0 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:11 -0400 Subject: KVM: x86: Introduce a workqueue to deliver PIT timer interrupts We really want to "kvm_set_irq" during the hrtimer callback, but that is risky because that is during interrupt context. Instead, offload the work to a workqueue, which is a bit safer and should provide most of the same functionality. Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 141 +++++++++++++++++++++++++++++++-------------------- arch/x86/kvm/i8254.h | 4 +- arch/x86/kvm/irq.c | 1 - 3 files changed, 88 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 188d82762c1b..467cc47fb733 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -34,6 +34,7 @@ #include #include +#include #include "irq.h" #include "i8254.h" @@ -244,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - raw_spin_lock(&ps->inject_lock); - if (atomic_dec_return(&ps->pit_timer.pending) < 0) + int value; + + spin_lock(&ps->inject_lock); + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. 
Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; - raw_spin_unlock(&ps->inject_lock); + spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -264,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit) { - pr_debug("execute del timer!\n"); - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -281,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { struct kvm_timer *pt = &ps->pit_timer; @@ -292,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = kvm_timer_fn; + pt->timer.function = pit_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -347,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) } break; default: - destroy_pit_timer(&ps->pit_timer); + destroy_pit_timer(kvm->arch.vpit); } } @@ -626,7 +692,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - raw_spin_lock_init(&pit->pit_state.inject_lock); + spin_lock_init(&pit->pit_state.inject_lock); + + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -685,54 +758,10 @@ void 
kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct kvm *kvm = vcpu->kvm; - struct kvm_kpit_state *ps; - - if (pit) { - int inject = 0; - ps = &pit->pit_state; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - raw_spin_lock(&ps->inject_lock); - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - raw_spin_unlock(&ps->inject_lock); - if (inject) - __inject_pit_timer_intr(kvm); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c2..46d08ca0b48f 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - raw_spinlock_t inject_lock; + spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -40,6 +40,8 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 0f4e488331c7..2095a049835e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); - kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); -- cgit From e7dca5c0eba63e4ba8e3586c4b37863fd7fadb5a Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Wed, 16 Jun 2010 17:11:12 -0400 Subject: KVM: x86: Allow any LAPIC to accept PIC interrupts If the guest wants to accept timer interrupts on a CPU other than the BSP, we need to remove this gate. 
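With the BSP gate removed, the check reduces to the following (a condensed view of the hunk below, not new logic):

	int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
	{
		u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
		int r = 0;

		/* any vcpu, not only the BSP, may now take ExtINT/PIC interrupts */
		if (!apic_hw_enabled(vcpu->arch.apic))
			r = 1;
		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
			r = 1;
		return r;
	}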
Signed-off-by: Chris Lalancette Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 024f6d1c2996..49573c78c24b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1107,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (kvm_vcpu_is_bsp(vcpu)) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; return r; } -- cgit From 7d5993d63f2bac75b89e171a7098044ec4bc701f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 17 Jun 2010 17:33:55 +0800 Subject: KVM: x86 emulator: fix group3 instruction decoding Group 3 instruction with ModRM reg field as 001 is defined as test instruction under AMD arch, and emulate_grp3() is ready for emulate it, so fix the decoding. static inline int emulate_grp3(...) { ... switch (c->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); ... } Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c990db0a3a02..abb8cec420a2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -336,11 +336,11 @@ static u32 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM, 0, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = -- cgit From a1a005f36e0defea7c5490772c318c6af2261d31 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:47:34 +0300 Subject: KVM: Fix xsave and xcr save/restore memory leak We allocate temporary kernel buffers for these structures, but never free them. 
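The shape of the fix (condensed from the hunk below): hoist the buffer pointers out of the individual case blocks so that every early break still reaches a common kfree() on the way out:

	struct kvm_xsave *xsave = NULL;
	struct kvm_xcrs *xcrs = NULL;
	...
	case KVM_GET_XSAVE: {
		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
		...
		break;
	}
	...
out:
	kfree(lapic);
	kfree(xsave);
	kfree(xcrs);
	return r;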
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c8dc9614e7d..d918cb15e5b5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2437,6 +2437,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; struct kvm_lapic_state *lapic = NULL; + struct kvm_xsave *xsave = NULL; + struct kvm_xcrs *xcrs = NULL; switch (ioctl) { case KVM_GET_LAPIC: { @@ -2632,8 +2634,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2648,8 +2648,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - struct kvm_xsave *xsave; - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!xsave) @@ -2663,8 +2661,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2680,8 +2676,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XCRS: { - struct kvm_xcrs *xcrs; - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!xcrs) @@ -2700,6 +2694,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: kfree(lapic); + kfree(xsave); + kfree(xcrs); return r; } -- cgit From d1ac91d8a2f00dc6a3954f7e8971339b0893edc4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 20 Jun 2010 15:54:43 +0300 Subject: KVM: Consolidate load/save temporary buffer allocation and freeing Instead of three temporary variables and three free calls, have one temporary variable (with four names) and one free call. 
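The pattern in isolation (taken from the hunk below): a single local union whose void * member lets one kfree() cover whichever buffer was actually allocated:

	union {
		struct kvm_lapic_state *lapic;
		struct kvm_xsave *xsave;
		struct kvm_xcrs *xcrs;
		void *buffer;
	} u;

	u.buffer = NULL;
	...
out:
	kfree(u.buffer);
	return r;

Since all members are pointers, writing through u.lapic, u.xsave, or u.xcrs and freeing through u.buffer is well defined.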
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d918cb15e5b5..8e60b6c9c0b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2436,25 +2436,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; - struct kvm_lapic_state *lapic = NULL; - struct kvm_xsave *xsave = NULL; - struct kvm_xcrs *xcrs = NULL; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2463,14 +2467,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; r = -EFAULT; - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; r = 0; @@ -2634,68 +2638,66 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); r = -EFAULT; - if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) break; r = 0; break; } case KVM_SET_XSAVE: { - xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; - if (!xsave) + if (!u.xsave) break; r = -EFAULT; - if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) break; - r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } case KVM_GET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); r = -EFAULT; - if (copy_to_user(argp, xcrs, + if (copy_to_user(argp, u.xcrs, sizeof(struct kvm_xcrs))) break; r = 0; break; } case KVM_SET_XCRS: { - xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; - if (!xcrs) + if (!u.xcrs) break; r = -EFAULT; - if (copy_from_user(xcrs, argp, + if (copy_from_user(u.xcrs, argp, sizeof(struct kvm_xcrs))) break; - r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); + r = 
kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } default: r = -EINVAL; } out: - kfree(lapic); - kfree(xsave); - kfree(xcrs); + kfree(u.buffer); return r; } -- cgit From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:44:20 +0300 Subject: KVM: Remove memory alias support As advertised in feature-removal-schedule.txt. Equivalent support is provided by overlapping memory regions. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 21 ------- arch/x86/kvm/mmu.c | 17 ++---- arch/x86/kvm/paging_tmpl.h | 3 +- arch/x86/kvm/x86.c | 125 ---------------------------------------- arch/x86/kvm/x86.h | 7 --- 5 files changed, 5 insertions(+), 168 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2ec2e27a403e..a57cdeacc4d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -69,8 +69,6 @@ #define IOPL_SHIFT 12 -#define KVM_ALIAS_SLOTS 4 - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 #define KVM_MMU_HASH_SHIFT 10 @@ -362,24 +360,7 @@ struct kvm_vcpu_arch { u64 hv_vapic; }; -struct kvm_mem_alias { - gfn_t base_gfn; - unsigned long npages; - gfn_t target_gfn; -#define KVM_ALIAS_INVALID 1UL - unsigned long flags; -}; - -#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION - -struct kvm_mem_aliases { - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; - int naliases; -}; - struct kvm_arch { - struct kvm_mem_aliases *aliases; - unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_alloc_mmu_pages; @@ -655,8 +636,6 @@ void kvm_disable_tdp(void); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); -struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); - static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8c2f580956d9..c5501bc10106 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm, struct kvm_memory_slot *slot; int *largepage_idx; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; @@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. 
- * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; - gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); @@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; - gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->unsync) continue; - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 863920f649fb..a21a86ef9e20 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -576,7 +576,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - Alias changes zap the entire shadow cache. */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) @@ -611,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] || + if (gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e60b6c9c0b0..62596d373a49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2740,115 +2740,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. 
- */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases, *old_aliases; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out; - - mutex_lock(&kvm->slots_lock); - - /* invalidate any gfn reference in case of deletion/shrinking */ - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kvm_mmu_zap_all(kvm); - kfree(old_aliases); - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out_unlock; - - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - - p = &aliases->aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - p->flags &= ~(KVM_ALIAS_INVALID); - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (aliases->aliases[n - 1].npages) - break; - aliases->naliases = n; - - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_aliases); - r = 0; - -out_unlock: - mutex_unlock(&kvm->slots_lock); -out: - return r; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; @@ -3056,7 +2947,6 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; - struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -3101,14 +2991,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: - r = -EFAULT; - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); - if (r) - goto out; - break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -5559,12 +5441,6 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!kvm->arch.aliases) { - kfree(kvm); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5622,7 +5498,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); cleanup_srcu_struct(&kvm->srcu); - kfree(kvm->arch.aliases); kfree(kvm); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285b..b7a404722d2b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } -static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) -{ - return rcu_dereference_check(kvm->arch.aliases, - srcu_read_lock_held(&kvm->srcu) - || 
lockdep_is_held(&kvm->slots_lock)); -} - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -- cgit From b74a07beed0e64bfba413dcb70dd6749c57f43dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:48:05 +0300 Subject: KVM: Remove kernel-allocated memory regions Equivalent (and better) functionality is provided by user-allocated memory regions. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 62596d373a49..9be6e4e5e8ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2967,22 +2967,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) -- cgit From 073d46133ab0b42154f6b8429f4f66dbe2760bda Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:34:34 +0300 Subject: KVM: i8259: reduce excessive abstraction for pic_irq_request() Part of the i8259 code pretends it isn't part of kvm, but we know better. Reduce excessive abstraction, eliminating callbacks and void pointers. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++---------- arch/x86/kvm/irq.h | 4 ---- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 2c73f4493148..caf6e1b3d957 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,8 @@ #include #include "trace.h" +static void pic_irq_request(struct kvm *kvm, int level); + static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { @@ -175,9 +177,9 @@ static void pic_update_irq(struct kvm_pic *s) } irq = pic_get_irq(&s->pics[0]); if (irq >= 0) - s->irq_request(s->irq_request_opaque, 1); + pic_irq_request(s->kvm, 1); else - s->irq_request(s->irq_request_opaque, 0); + pic_irq_request(s->kvm, 0); } void kvm_pic_update_irq(struct kvm_pic *s) @@ -262,8 +264,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; - struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; u8 irr = s->irr, isr = s->imr; s->last_irr = 0; @@ -302,8 +303,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) /* * deassert a pending interrupt */ - s->pics_state->irq_request(s->pics_state-> - irq_request_opaque, 0); + pic_irq_request(s->pics_state->kvm, 0); s->init_state = 1; s->init4 = val & 1; if (val & 0x02) @@ -519,9 +519,8 @@ static int picdev_read(struct kvm_io_device *this, /* * callback when PIC0 irq status changed */ -static void pic_irq_request(void *opaque, int level) +static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); @@ -550,8 +549,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) 
s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; - s->irq_request = pic_irq_request; - s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413d..ffed06871c5c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -38,8 +38,6 @@ struct kvm; struct kvm_vcpu; -typedef void irq_request_func(void *opaque, int level); - struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ @@ -67,8 +65,6 @@ struct kvm_pic { unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); -- cgit From 36633f32ba4c238403d19584754b30fe469d6dcb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 May 2010 17:38:06 +0300 Subject: KVM: i8259: simplify pic_irq_request() calling sequence Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index caf6e1b3d957..bc10f0bd3819 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -176,10 +176,7 @@ static void pic_update_irq(struct kvm_pic *s) pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) - pic_irq_request(s->kvm, 1); - else - pic_irq_request(s->kvm, 0); + pic_irq_request(s->kvm, irq >= 0); } void kvm_pic_update_irq(struct kvm_pic *s) -- cgit From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:34:53 +0300 Subject: KVM: Add mini-API for vcpu->requests Makes it a little more readable and hackable. 
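For reference, a sketch of what the new wrappers plausibly reduce to (the real definitions live in the generic KVM headers, not in the x86 diff below): kvm_make_request() is an atomic set_bit() into vcpu->requests, and kvm_check_request() a test_and_clear_bit():

static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
	set_bit(req, &vcpu->requests);
}

static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
	return test_and_clear_bit(req, &vcpu->requests);
}

Call sites then read as "make a request" / "consume the request if pending" instead of open-coded bit twiddling, which is what the conversion below is about.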
Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 27 +++++++++++++-------------- 6 files changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 49573c78c24b..77d8c0f4817d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -534,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c5501bc10106..690a7fc58c17 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_mmu_mark_parents_unsync(sp); } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); @@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f7a6fdcf8ef3..587b99d37d44 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1494,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 564548fbb3d6..e16a0dbe74d8 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -32,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); } if (waitqueue_active(q)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 345a35470511..661c6e199b4a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -899,7 +899,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) unsigned long sysenter_esp; kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9be6e4e5e8ee..7ef44107a14a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -296,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + 
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -948,7 +948,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); return 1; } @@ -2253,7 +2253,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -4617,7 +4617,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->run->request_interrupt_window; if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); r = kvm_mmu_reload(vcpu); @@ -4625,26 +4625,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) kvm_write_guest_time(vcpu); - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_x86_ops->tlb_flush(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } @@ -4773,7 +4772,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: @@ -5255,7 +5254,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) vcpu->guest_fpu_loaded = 0; fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); } -- cgit From 7ac77099ce88a0c31b75acd0ec5ef3da4415a6d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 10:57:45 +0300 Subject: KVM: Prevent internal slots from being COWed If a process with a memory slot is COWed, the page will change its address (despite having an elevated reference count). This breaks internal memory slots which have their physical addresses loaded into vmcs registers (see the APIC access memory slot). 
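An illustrative userspace sketch (not part of the patch) of why MAP_PRIVATE anonymous memory is a problem here: after fork(), the first write from either side triggers copy-on-write, so the writer's virtual address silently moves to a different physical page. The demo reads the PFN from /proc/self/pagemap (bit 63 = present, bits 0-54 = PFN; on newer kernels the PFN field reads as zero without CAP_SYS_ADMIN):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>

static uint64_t pfn_of(void *addr)
{
	uint64_t entry = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 0;
	pread(fd, &entry, sizeof(entry),
	      ((uintptr_t)addr / sysconf(_SC_PAGESIZE)) * sizeof(entry));
	close(fd);
	return (entry & (1ULL << 63)) ? (entry & ((1ULL << 55) - 1)) : 0;
}

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);	/* compare with MAP_SHARED */

	p[0] = 1;	/* fault the page in */
	printf("parent pfn %#llx\n", (unsigned long long)pfn_of(p));
	if (fork() == 0) {
		p[0] = 2;	/* COW: the child now maps a different physical page */
		printf("child  pfn %#llx\n", (unsigned long long)pfn_of(p));
		_exit(0);
	}
	wait(NULL);
	return 0;
}

With MAP_SHARED | MAP_ANONYMOUS the child's write lands on the same physical page, which is what the internal-slot mappings rely on.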
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ef44107a14a..68be38e233f5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5491,6 +5491,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5503,7 +5508,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(¤t->mm->mmap_sem); -- cgit From 6c3f6041172b78d5532c6bf3680d304e92ec2e66 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 22 Jun 2010 13:49:21 +0800 Subject: KVM: x86: Enable AVX for guest Enable Intel(R) Advanced Vector Extension(AVX) for guest. The detection of AVX feature includes OSXSAVE bit testing. When OSXSAVE bit is not set, even if AVX is supported, the AVX instruction would result in UD as well. So we're safe to expose AVX bits to guest directly. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 68be38e233f5..d39d6b25d3e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1963,13 +1963,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | -- cgit From 529df65e394e30a78f2633b575fd81fa5b973e30 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Mon, 21 Jun 2010 11:29:40 -0400 Subject: KVM: Search the LAPIC's for one that will accept a PIC interrupt Older versions of 32-bit linux have a "Checking 'hlt' instruction" test where they repeatedly call the 'hlt' instruction, and then expect a timer interrupt to kick the CPU out of halt. This happens before any LAPIC or IOAPIC setup happens, which means that all of the APIC's are in virtual wire mode at this point. Unfortunately, the current implementation of virtual wire mode is hardcoded to only kick the BSP, so if a crash+kexec occurs on a different vcpu, it will never get kicked. This patch makes pic_unlock() do the equivalent of kvm_irq_delivery_to_apic() for the IOAPIC code. That is, it runs through all of the vcpus looking for one that is in virtual wire mode. In the normal case where LAPICs and IOAPICs are configured, this won't be used at all. 
In the bootstrap phase of a modern OS, before the LAPICs and IOAPICs are configured, this will have exactly the same behavior as today; VCPU0 is always looked at first, so it will always get out of the loop after the first iteration. This will only go through the loop more than once during a kexec/kdump, in which case it will only do it a few times until the kexec'ed kernel programs the LAPIC and IOAPIC. Signed-off-by: Chris Lalancette Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index bc10f0bd3819..819b748a33f9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -46,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *found = NULL; + int i; s->wakeup_needed = false; raw_spin_unlock(&s->lock); if (wakeup) { - vcpu = s->kvm->bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + kvm_vcpu_kick(found); } } -- cgit From 3e0075094734de122e4cb09f930fa853a3c59f09 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 23 Jun 2010 14:26:18 +0300 Subject: KVM: Simplify vcpu_enter_guest() mmu reload logic slightly No need to reload the mmu in between two different vcpu->requests checks. kvm_mmu_reload() may trigger KVM_REQ_TRIPLE_FAULT, but that will be caught during atomic guest entry later. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6b25d3e5..27322d341232 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4616,15 +4616,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) + if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - - if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) @@ -4649,6 +4643,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); -- cgit From f5f48ee15c2ee3e44cf429e34b16c6fa9b900246 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 30 Jun 2010 12:25:15 +0800 Subject: KVM: VMX: Execute WBINVD to keep data consistency with assigned devices Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or CLFLUSH, we need to maintain data consistency either by: 1: flushing cache (wbinvd) when the guest is scheduled out if there is no wbinvd exit, or 2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits. 
Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/emulate.c | 5 ++++- arch/x86/kvm/svm.c | 7 +++++++ arch/x86/kvm/vmx.c | 10 +++++++++- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a57cdeacc4d2..2bda62485c4c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -358,6 +359,8 @@ struct kvm_vcpu_arch { /* fields used by HYPER-V emulation */ u64 hv_vapic; + + cpumask_var_t wbinvd_dirty_mask; }; struct kvm_arch { @@ -514,6 +517,8 @@ struct kvm_x86_ops { void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + bool (*has_wbinvd_exit)(void); + const struct trace_print_flags *exit_reasons_str; }; @@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); int emulate_clts(struct kvm_vcpu *vcpu); +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abb8cec420a2..e8bdddc4509e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3138,8 +3138,11 @@ twobyte_insn: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; break; - case 0x08: /* invd */ case 0x09: /* wbinvd */ + kvm_emulate_wbinvd(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ c->dst.type = OP_NONE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 587b99d37d44..56c9b6bd7655 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_has_wbinvd_exit(void) +{ + return true; +} + static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .set_supported_cpuid = svm_set_supported_cpuid, + + .has_wbinvd_exit = svm_has_wbinvd_exit, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 661c6e199b4a..4dfb1dc09c88 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -3397,7 +3403,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ + kvm_emulate_wbinvd(vcpu); return 1; } @@ -4347,6 +4353,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .rdtscp_supported = vmx_rdtscp_supported, .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 
27322d341232..3d72fc067059 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1783,8 +1783,28 @@ out: return r; } +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { unsigned long khz = cpufreq_quick_get(cpu); @@ -3660,6 +3680,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } + wbinvd(); + return X86EMUL_CONTINUE; +} +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + int emulate_clts(struct kvm_vcpu *vcpu) { kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); @@ -5263,6 +5298,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5392,7 +5428,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: -- cgit From 36a2e6774bfb5f32a0f23bb155f1f960321f291b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:02 +0800 Subject: KVM: MMU: fix writable sync sp mapping While we sync many unsync sp at one time(in mmu_sync_children()), we may mapping the spte writable, it's dangerous, if one unsync sp's mapping gfn is another unsync page's gfn. For example: SP1.pte[0] = P SP2.gfn's pfn = P [SP1.pte[0] = SP2.gfn's pfn] First, we write protected SP1 and SP2, but SP1 and SP2 are still the unsync sp. Then, sync SP1 first, it will detect SP1.pte[0].gfn only has one unsync-sp, that is SP2, so it will mapping it writable, but we plan to sync SP2 soon, at this point, the SP2->unsync is not reliable since later we sync SP2 but SP2->gfn is already writable. So the final result is: SP2 is the sync page but SP2.gfn is writable. This bug will corrupt guest's page table, fixed by mark read-only mapping if the mapped gfn has shadow pages. 
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 690a7fc58c17..ca07ed083b59 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1810,11 +1810,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool need_unsync = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) + return 1; + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; if (!need_unsync && !s->unsync) { - if (!can_unsync || !oos_shadow) + if (!oos_shadow) return 1; need_unsync = true; } -- cgit From 5fd5387c89ec99ff6cb82d2477ffeb7211b781c2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:45 +0800 Subject: KVM: MMU: fix conflict access permissions in direct sp In no-direct mapping, we mark sp is 'direct' when we mapping the guest's larger page, but its access is encoded form upper page-struct entire not include the last mapping, it will cause access conflict. For example, have this mapping: [W] / PDE1 -> |---| P[W] | | LPA \ PDE2 -> |---| [R] P have two children, PDE1 and PDE2, both PDE1 and PDE2 mapping the same lage page(LPA). The P's access is WR, PDE1's access is WR, PDE2's access is RO(just consider read-write permissions here) When guest access PDE1, we will create a direct sp for LPA, the sp's access is from P, is W, then we will mark the ptes is W in this sp. Then, guest access PDE2, we will find LPA's shadow page, is the same as PDE's, and mark the ptes is RO. So, if guest access PDE1, the incorrect #PF is occured. Fixed by encode the last mapping access into direct shadow page Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a21a86ef9e20..f4e4aaa65ff3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,6 +339,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct = 1; if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; + access &= gw->pte_access; + /* * It is a large guest pages backed by small host pages, * So we set @direct(@sp->role.direct)=1, and set -- cgit From 9e7b0e7fba45ca3c6357aeb7091ebc281f1de365 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:03:28 +0800 Subject: KVM: MMU: fix direct sp's access corrupted If the mapping is writable but the dirty flag is not set, we will find the read-only direct sp and setup the mapping, then if the write #PF occur, we will mark this mapping writable in the read-only direct sp, now, other real read-only mapping will happily write it without #PF. It may hurt guest's COW Fixed by re-install the mapping when write #PF occur. 
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f4e4aaa65ff3..117d63f6304d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -325,8 +325,32 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + unsigned direct_access; + + if (level != gw->level) + continue; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + direct_access = gw->pt_access & gw->pte_access; + if (!is_dirty_gpte(gw->ptes[gw->level - 1])) + direct_access &= ~ACC_WRITE_MASK; + + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + continue; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); -- cgit From 84754cd8fca66ed476585eabad68cacf42834199 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:05:00 +0800 Subject: KVM: MMU: cleanup FNAME(fetch)() functions Cleanup this function that we are already get the direct sp's access Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 117d63f6304d..59e750c1a269 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -306,12 +306,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, gfn_t table_gfn; int r; int level; + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + unsigned direct_access; pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; + direct_access = gw->pt_access & gw->pte_access; + if (!dirty) + direct_access &= ~ACC_WRITE_MASK; + for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; @@ -319,15 +325,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, - is_dirty_gpte(gw->ptes[gw->level-1]), - ptwrite, level, + dirty, ptwrite, level, gw->gfn, pfn, false, true); break; } if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; - unsigned direct_access; if (level != gw->level) continue; @@ -339,10 +343,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, * so we should update the spte at this point to get * a new sp with the correct access. 
*/ - direct_access = gw->pt_access & gw->pte_access; - if (!is_dirty_gpte(gw->ptes[gw->level - 1])) - direct_access &= ~ACC_WRITE_MASK; - child = page_header(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) continue; @@ -359,11 +359,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (level <= gw->level) { - int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_gpte(gw->ptes[level - delta])) - access &= ~ACC_WRITE_MASK; - access &= gw->pte_access; + access = direct_access; /* * It is a large guest pages backed by small host pages, -- cgit From 828554136bbacae6e39fc31b9cd7e7c660ad7530 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Jul 2010 16:00:11 +0200 Subject: KVM: Remove unnecessary divide operations This patch converts unnecessary divide and modulo operations in the KVM large page related code into logical operations. This allows to convert gfn_t to u64 while not breaking 32 bit builds. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2bda62485c4c..50c79b9f5c38 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -44,7 +44,8 @@ /* KVM Hugepage definitions for x86 */ #define KVM_NR_PAGE_SIZES 3 -#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ca07ed083b59..a20fd613acfe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -423,8 +423,8 @@ static int *slot_largepage_idx(gfn_t gfn, { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].write_count; } @@ -528,8 +528,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].rmap_pde; } -- cgit From c15a5958a0b6dbf06b3c05972694f04a0c50a4cf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:22 -0400 Subject: x86-64, asm: Directly access per-cpu IST Use a direct per-cpu reference for the IST instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ffa..59af275b37a2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit From 72c511dd596cff88d6523f231a0fbb8f73006d51 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:23 -0400 Subject: x86-32, asm: Directly access per-cpu GDT Use a direct per-cpu reference for the GDT instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..233c5829e7ac 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -611,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -791,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS -- cgit From dd180b3e90253cb4ca95d603a8c17413f8daec69 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 3 Jul 2010 16:02:42 +0800 Subject: KVM: VMX: fix tlb flush with invalid root Commit 341d9b535b6c simplify reload logic while entry guest mode, it can avoid unnecessary sync-root if KVM_REQ_MMU_RELOAD and KVM_REQ_MMU_SYNC both set. But, it cause a issue that when we handle 'KVM_REQ_TLB_FLUSH', the root is invalid, it is triggered during my test: Kernel BUG at ffffffffa00212b8 [verbose debug info unavailable] ...... Fixed by directly return if the root is not ready. 
Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 2 -- arch/x86/kvm/vmx.c | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 50c79b9f5c38..502e53f999cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -40,6 +40,8 @@ 0xFFFFFF0000000000ULL) #define INVALID_PAGE (~(hpa_t)0) +#define VALID_PAGE(x) ((x) != INVALID_PAGE) + #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a20fd613acfe..70cdf6876b5f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,8 +92,6 @@ module_param(oos_shadow, bool, 0644); #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4dfb1dc09c88..2fdcc9819f36 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1828,8 +1828,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_context(to_vmx(vcpu)); - if (enable_ept) + if (enable_ept) { + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -- cgit From be38d276b0189fa86231fc311428622a1981ad62 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:31:27 +0300 Subject: KVM: MMU: Introduce drop_spte() When we call rmap_remove(), we (almost) always immediately follow it by an __set_spte() to a nonpresent pte. Since we need to perform the two operations atomically, to avoid losing the dirty and accessed bits, introduce a helper drop_spte() and convert all call sites. The operation is still nonatomic at this point. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 30 +++++++++++++++++------------- arch/x86/kvm/paging_tmpl.h | 13 ++++++------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70cdf6876b5f..1ad39cf70e18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + rmap_remove(kvm, sptep); + __set_spte(sptep, new_spte); +} + static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; @@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - rmap_remove(kvm, spte); + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - rmap_remove(kvm, &pt[i]); + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - rmap_remove(vcpu->kvm, sptep); - spte = shadow_trap_nonpresent_pte; - goto set_pte; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; } spte |= PT_WRITABLE_MASK; @@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: __set_spte(sptep, spte); +done: return ret; } @@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - rmap_remove(vcpu->kvm, spte); + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 59e750c1a269..796a325c7e59 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ 
b/arch/x86/kvm/paging_tmpl.h @@ -353,8 +353,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } if (is_large_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } @@ -516,12 +515,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + drop_spte(vcpu->kvm, sptep, + shadow_trap_nonpresent_pte); need_flush = 1; - } - __set_spte(sptep, shadow_trap_nonpresent_pte); + } else + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } @@ -637,12 +637,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; - rmap_remove(vcpu->kvm, &sp->spt[i]); if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - __set_spte(&sp->spt[i], nonpresent); + drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); continue; } -- cgit From ce061867aa2877605cda96fa8ec7dff15f70a983 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:38:12 +0300 Subject: KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Since we need to make the check atomic, move it to the place that will set the new spte. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ad39cf70e18..fbdca08b8d8c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - pfn_t pfn; gfn_t gfn; unsigned long *rmapp; int i; - if (!is_rmap_spte(*spte)) - return; sp = page_header(__pa(spte)); - pfn = spte_to_pfn(*spte); - if (*spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(pfn); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { @@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { + pfn_t pfn; + + if (!is_rmap_spte(*sptep)) { + __set_spte(sptep, new_spte); + return; + } + pfn = spte_to_pfn(*sptep); + if (*sptep & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(*sptep)) + kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); __set_spte(sptep, new_spte); } -- cgit From a9221dd5ec125fbec1702fae016c6d2ea1a9a3da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:48:06 +0300 Subject: KVM: MMU: Atomically check for accessed bit when dropping an spte Currently, in the window between the check for the accessed bit, and actually dropping the spte, a vcpu can access the page through the spte and set the bit, which will be ignored by the mmu. Fix by using an exchange operation to atmoically fetch the spte and drop it. 
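The 64-bit exchange is the interesting part on 32-bit hosts, which have no native 8-byte xchg; the diff below falls back to a cmpxchg64 loop there. A minimal standalone sketch of that pattern using C11 atomics (illustrative only, not the kernel code):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Atomically swap in a new value and return the old one, built from a
 * compare-and-swap loop -- the same shape as the cmpxchg64 fallback below. */
static uint64_t xchg64(_Atomic uint64_t *p, uint64_t new_val)
{
	uint64_t old = atomic_load(p);

	while (!atomic_compare_exchange_weak(p, &old, new_val))
		;	/* on failure, 'old' is refreshed with the current value */
	return old;
}

int main(void)
{
	_Atomic uint64_t spte = 0x8000000000000067ULL;	/* made-up spte bits */
	uint64_t prev = xchg64(&spte, 0);		/* "drop" it atomically */

	printf("old spte %#llx, now %#llx\n",
	       (unsigned long long)prev, (unsigned long long)atomic_load(&spte));
	return 0;
}

Whatever accessed/dirty bits were set in the returned old value can then be propagated to the struct page with no window in which another CPU's concurrent update is lost.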
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbdca08b8d8c..ba2efcf2b86e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte) #endif } +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ +#ifdef CONFIG_X86_64 + return xchg(sptep, new_spte); +#else + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { pfn_t pfn; + u64 old_spte; - if (!is_rmap_spte(*sptep)) { - __set_spte(sptep, new_spte); + old_spte = __xchg_spte(sptep, new_spte); + if (!is_rmap_spte(old_spte)) return; - } - pfn = spte_to_pfn(*sptep); - if (*sptep & shadow_accessed_mask) + pfn = spte_to_pfn(old_spte); + if (old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*sptep)) + if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); - __set_spte(sptep, new_spte); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -- cgit From b79b93f92cb3b66b89d75525fdfd2454b1e1f446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 15:46:44 +0300 Subject: KVM: MMU: Don't drop accessed bit while updating an spte __set_spte() will happily replace an spte with the accessed bit set with one that has the accessed bit clear. Add a helper update_spte() which checks for this condition and updates the page flag if needed. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba2efcf2b86e..d8d48329cb82 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 old_spte; + + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + __set_spte(sptep, new_spte); + } else { + old_spte = __xchg_spte(sptep, new_spte); + if (old_spte & shadow_accessed_mask) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + } +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); + update_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte; + u64 *spte, new_spte, old_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_accessed_mask; if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); - __set_spte(spte, new_spte); + old_spte = __xchg_spte(spte, new_spte); + if (is_shadow_present_pte(old_spte) + && (old_spte & shadow_accessed_mask)) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } } @@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - __set_spte(sptep, spte); + update_spte(sptep, spte); done: return ret; } -- cgit From a5046e6c7d97d6574ffe6367311ea0b0de56aa58 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:49:05 +0800 Subject: KVM: x86 emulator: fix 'mov sreg,rm16' instruction decoding Memory reads for 'mov sreg,rm16' should be 16 bits only. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e8bdddc4509e..d842a7d2bc64 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -170,7 +170,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem | ModRM, Group | Group1A, + ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ -- cgit From ce7a0ad3bdcd86e6cf907eb5992fecb1503daa26 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:50:21 +0800 Subject: KVM: x86 emulator: fix the comment of out instruction Fix the comment of out instruction, using the same style as the other instructions. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d842a7d2bc64..ad8d7cdd1eb9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2949,8 +2949,8 @@ special_insn: &c->dst.val)) goto done; /* IO is needed */ break; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ + case 0xee: /* out dx,al */ + case 0xef: /* out dx,(e/r)ax */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); -- cgit From e97e883f8bfbe02cfc2bfff45e68921dfe590c7e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:51:09 +0800 Subject: KVM: x86 emulator: fix 'and AL,imm8' instruction decoding 'and AL,imm8' should be mask as ByteOp, otherwise the dest operand length will no correct and we may fill the full EAX when writeback. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ad8d7cdd1eb9..59568ad21ab3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -123,7 +123,7 @@ static u32 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- cgit From b16b2b7bb5a78afceb7fe22f2a04476cd70182b7 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:52:53 +0800 Subject: KVM: x86 emulator: fix 'mov rm,sreg' instruction decoding The source operand of 'mov rm,sreg' is segment register, not general-purpose register, so remove SrcReg from decoding. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 59568ad21ab3..8337567a0f44 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -169,7 +169,7 @@ static u32 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, -- cgit From 07cbc6c185aee2c0479776845988242a040c7c93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 6 Jul 2010 16:54:19 +0800 Subject: KVM: x86 emulator: fix cli/sti instruction emulation If IOPL check fail, the cli/sti emulate GP and then we should skip writeback since the default write OP is OP_REG. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8337567a0f44..286572a5675b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2979,17 +2979,19 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) + if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); - else { + goto done; + } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ -- cgit From 5d55f299f97769130c6cc67896414c988db309ab Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 7 Jul 2010 17:43:35 +0800 Subject: KVM: x86 emulator: re-implementing 'mov AL,moffs' instruction decoding This patch change to use DstAcc for decoding 'mov AL, moffs' and introduced SrcAcc for decoding 'mov moffs, AL'. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 286572a5675b..255473f974ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -70,6 +70,7 @@ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ +#define SrcAcc (0xd<<4) /* Source Accumulator */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -177,8 +178,8 @@ static u32 opcode_table[256] = { 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ @@ -1186,6 +1187,25 @@ done_prefixes: else c->src.val = insn_fetch(u8, 1, c->eip); break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; case SrcOne: c->src.bytes = 1; c->src.val = 1; @@ -2854,13 +2874,7 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; + case 0xa0 ... 0xa3: /* mov */ case 0xa4 ... 0xa5: /* movs */ goto mov; case 0xa6 ... 0xa7: /* cmps */ -- cgit From b0eeec29fe7a5b114000f769bd68ffa02652bfb7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 15:40:18 +0300 Subject: KVM: MMU: Only indicate a fetch fault in page fault error code if nx is enabled Bit 4 of the page fault error code is set only if EFER.NX is set. 
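For context, the x86 page-fault error code bit layout (hardware-defined; KVM mirrors it with PFERR_* masks in arch/x86/include/asm/kvm_host.h), which is where "bit 4" comes from:

#define PFERR_PRESENT_MASK	(1U << 0)	/* 0: page not present, 1: protection violation */
#define PFERR_WRITE_MASK	(1U << 1)	/* access was a write */
#define PFERR_USER_MASK		(1U << 2)	/* access from user mode (CPL 3) */
#define PFERR_RSVD_MASK		(1U << 3)	/* reserved bit set in a paging-structure entry */
#define PFERR_FETCH_MASK	(1U << 4)	/* instruction fetch -- reported only with NX enabled */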
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 796a325c7e59..3a3f6d784d79 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -245,7 +245,7 @@ err: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) + if (fetch_fault && is_nx(vcpu)) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; -- cgit From f59c1d2ded54e4bd7a9126f4a32c9eca8b336457 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 6 Jul 2010 16:20:43 +0300 Subject: KVM: MMU: Keep going on permission error Real hardware disregards permission errors when computing page fault error code bit 0 (page present). Do the same. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 52 ++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 3a3f6d784d79..1cea41cad069 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -119,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker, { pt_element_t pte; gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - int rsvd_fault = 0; + bool eperm, present, rsvd_fault; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: + present = true; + eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + goto error; + } --walker->level; } #endif @@ -151,31 +155,36 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) - goto not_present; + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + present = false; + break; + } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + break; + } - rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); - if (rsvd_fault) - goto access_error; + if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + rsvd_fault = true; + break; + } if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) - goto access_error; + eperm = true; if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; + eperm = true; #if PTTYPE == 64 if (fetch_fault && (pte & PT64_NX_MASK)) - goto access_error; + eperm = true; #endif - if (!(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, @@ -214,6 +223,9 @@ walk: --walker->level; } + if (!present || eperm || rsvd_fault) + goto error; + if (write_fault && !is_dirty_gpte(pte)) { bool ret; @@ -233,14 +245,10 @@ walk: __func__, (u64)pte, pte_access, pt_access); return 1; -not_present: +error: walker->error_code = 0; - goto err; - -access_error: - walker->error_code = 
PFERR_PRESENT_MASK; - -err: + if (present) + walker->error_code |= PFERR_PRESENT_MASK; if (write_fault) walker->error_code |= PFERR_WRITE_MASK; if (user_fault) -- cgit From 673813e81d8468e80b6dd0fa839923eb9748dc49 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Jul 2010 15:02:25 +0200 Subject: KVM: fix lock imbalance in kvm_create_pit() Stanse found that there is an omitted unlock in kvm_create_pit in one fail path. Add proper unlock there. Signed-off-by: Jiri Slaby Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Gleb Natapov Cc: "Michael S. Tsirkin" Cc: Gregory Haskins Cc: kvm@vger.kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 467cc47fb733..70db4d435393 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -696,6 +696,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pit->wq = create_singlethread_workqueue("kvm-pit-wq"); if (!pit->wq) { + mutex_unlock(&pit->pit_state.lock); kfree(pit); return NULL; } -- cgit From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 7 Jul 2010 20:16:45 +0300 Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area Currently if guest access address that belongs to memory slot but is not backed up by page or page is read only KVM treats it like MMIO access. Remove that capability. It was never part of the interface and should not be relied upon. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d8d48329cb82..89d7a2cae53b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) if (is_hwpoison_pfn(pfn)) { kvm_send_hwpoison_signal(kvm, gfn); return 0; - } + } else if (is_fault_pfn(pfn)) + return -EFAULT; + return 1; } -- cgit From a6f177efaa5856e22ed0d3c1e81e65b41654d083 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 8 Jul 2010 12:41:12 +0300 Subject: KVM: Reenter guest after emulation failure if due to access to non-mmio address When shadow pages are in use sometimes KVM try to emulate an instruction when it accesses a shadowed page. If emulation fails KVM un-shadows the page and reenter guest to allow vcpu to execute the instruction. If page is not in shadow page hash KVM assumes that this was attempt to do MMIO and reports emulation failure to userspace since there is no way to fix the situation. This logic has a race though. If two vcpus tries to write to the same shadowed page simultaneously both will enter emulator, but only one of them will find the page in shadow page hash since the one who founds it also removes it from there, so another cpu will report failure to userspace and will abort the guest. Fix this by checking (in addition to checking shadowed page hash) that page that caused the emulation belongs to valid memory slot. If it is then reenter the guest to allow vcpu to reexecute the instruction. 
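The decision the new reexecute_instruction() helper makes can be restated as a small stand-alone sketch. The struct fields below are stand-ins for kvm_mmu_unprotect_page_virt(), the gva-to-gpa translation and the memslot lookup; should_reexecute() is an invented name used only for illustration.

#include <stdbool.h>
#include <stdio.h>

struct fault_info {
	bool unprotected_shadow_page;	/* write hit a shadowed page table   */
	bool gpa_unmapped;		/* translation failed, let CPU fault */
	bool backed_by_memslot;		/* gpa maps to real guest memory     */
};

static bool should_reexecute(const struct fault_info *f)
{
	if (f->unprotected_shadow_page)
		return true;
	if (f->gpa_unmapped)
		return true;
	if (f->backed_by_memslot)
		return true;
	return false;			/* genuine MMIO: report the failure */
}

int main(void)
{
	struct fault_info f = { false, false, true };

	printf("re-enter guest: %s\n", should_reexecute(&f) ? "yes" : "no");
	return 0;
}

Only when none of the three conditions holds is the emulation failure reported to userspace.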
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d72fc067059..d51eed239b47 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3930,6 +3930,29 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return EMULATE_FAIL; } +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, @@ -3998,7 +4021,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4019,12 +4042,7 @@ restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); if (r) { /* emulation failed */ - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-entetr the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); -- cgit From aea924f606c309feead37ab5c43f410a08ff3826 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 10 Jul 2010 17:37:56 +0800 Subject: KVM: PIT: stop vpit before freeing irq_routing Fix: general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ...... Call Trace: [] ? kvm_set_irq+0xdd/0x24b [kvm] [] ? trace_hardirqs_off_caller+0x1f/0x10e [] ? sub_preempt_count+0xe/0xb6 [] ? put_lock_stats+0xe/0x27 ... 
RIP [] kvm_set_irq+0x17e/0x24b [kvm] This bug is triggered when guest is shutdown, is because we freed irq_routing before pit thread stopped Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 3 +++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 70db4d435393..0fd6378981f4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -752,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d51eed239b47..d721e2d81a53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5523,12 +5523,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); -- cgit From 908e75f3e70ca580cc20442cf6780dcc2d0557b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 7 Jul 2010 14:09:38 +0300 Subject: KVM: Expose MCE control MSRs to userspace Userspace needs to reset and save/restore these MSRs. The MCE banks are not exposed since their number varies from vcpu to vcpu. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d721e2d81a53..eb55ec55125c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -744,6 +744,8 @@ static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; static int set_efer(struct kvm_vcpu *vcpu, u64 efer) -- cgit From 32ef26a3598636be520abed90ed0c2f439d36bbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:04 +0300 Subject: KVM: MMU: Add link_shadow_page() helper To simplify the process of fetching an spte, add a helper that links a shadow page to an spte. 
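The linking spte itself is just the child table's physical address with the present, accessed, writable and user bits set. A hedged sketch of that construction follows; the PT_* positions are the architectural page-table bits, child_table_pa stands in for __pa(sp->spt), and make_link_spte() is an invented name.

#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_WRITABLE_MASK	(1ULL << 1)
#define PT_USER_MASK		(1ULL << 2)
#define PT_ACCESSED_MASK	(1ULL << 5)

static uint64_t make_link_spte(uint64_t child_table_pa)
{
	return child_table_pa
	       | PT_PRESENT_MASK | PT_ACCESSED_MASK
	       | PT_WRITABLE_MASK | PT_USER_MASK;
}

int main(void)
{
	printf("link spte: %#llx\n",
	       (unsigned long long)make_link_spte(0x12345000ULL));
	return 0;
}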
Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 7 ++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89d7a2cae53b..df3a7a79cce3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1482,6 +1482,16 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cea41cad069..36dc0749c878 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -309,7 +309,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp; - u64 spte, *sptep = NULL; + u64 *sptep = NULL; int direct; gfn_t table_gfn; int r; @@ -395,10 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } } - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + link_shadow_page(sptep, sp); } return sptep; -- cgit From 121eee97a7802acda8b78436cc53196e9885549f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:05 +0300 Subject: KVM: MMU: Use __set_spte to link shadow pages To avoid split accesses to 64 bit sptes on i386, use __set_spte() to link shadow pages together. (not technically required since shadow pages are __GFP_KERNEL, so upper 32 bits are always clear) Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df3a7a79cce3..5a6019a534a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1489,7 +1489,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + __set_spte(sptep, spte); } static void kvm_mmu_page_unlink_children(struct kvm *kvm, -- cgit From a3aa51cfaafe9179add88db20506ccb07e030b47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:06 +0300 Subject: KVM: MMU: Add drop_large_spte() helper To clarify spte fetching code, move large spte handling into a helper. 
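The helper's job is easy to model in isolation: if the entry at this level is a large mapping (PS bit set), it must be torn down, and in the real code the remote TLBs flushed, before a next-level table can be linked underneath it. A minimal sketch, with drop_large() as an invented stand-in:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_PAGE_SIZE_MASK	(1ULL << 7)	/* PS bit in a PDE/PDPTE */

static bool is_large_pte(uint64_t pte)
{
	return pte & PT_PAGE_SIZE_MASK;
}

/* nonpresent stands in for shadow_trap_nonpresent_pte */
static void drop_large(uint64_t *sptep, uint64_t nonpresent)
{
	if (is_large_pte(*sptep))
		*sptep = nonpresent;	/* real code also flushes remote TLBs */
}

int main(void)
{
	uint64_t spte = 0x200000ULL | PT_PAGE_SIZE_MASK | PT_PRESENT_MASK;

	drop_large(&spte, 0);
	printf("spte after drop: %#llx\n", (unsigned long long)spte);
	return 0;
}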
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ arch/x86/kvm/paging_tmpl.h | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a6019a534a3..b75d6cb44ab6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1492,6 +1492,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) __set_spte(sptep, spte); } +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36dc0749c878..0fb7068d64c7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -360,10 +360,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, kvm_flush_remote_tlbs(vcpu->kvm); } - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + drop_large_spte(vcpu, sptep); if (level <= gw->level) { direct = 1; -- cgit From a357bd229cdaf37a41798d238ab50b34c71dd0d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:07 +0300 Subject: KVM: MMU: Add validate_direct_spte() helper Add a helper to verify that a direct shadow page is valid wrt the required access permissions; drop the page if it is not valid. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 +++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 27 ++++++--------------------- 2 files changed, 29 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b75d6cb44ab6..36c62f33513f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1500,6 +1500,29 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0fb7068d64c7..0c7461d3a5be 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -338,30 +338,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { - struct kvm_mmu_page *child; - - if (level != gw->level) - continue; - - /* - * For the direct sp, if the guest pte's dirty bit - * changed form clean to dirty, it will corrupt the - * sp's access: allow writable in the read-only sp, - * so we should update the spte at this point to get - * a new sp with the correct access. 
- */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); - if (child->role.access == direct_access) - continue; - - mmu_page_remove_parent_pte(child, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) + && level == gw->level) + validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); + if (is_shadow_present_pte(*sptep)) + continue; + if (level <= gw->level) { direct = 1; access = direct_access; -- cgit From 39c8c672a18c52048343d7531dfb2dcf3431ee74 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:08 +0300 Subject: KVM: MMU: Add gpte_valid() helper Move the code to check whether a gpte has changed since we fetched it into a helper. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 0c7461d3a5be..e1c1f9eb1cc1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -299,6 +299,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gpte_to_gfn(gpte), pfn, true, true); } +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, + struct guest_walker *gw, int level) +{ + int r; + pt_element_t curr_pte; + + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -312,11 +323,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, u64 *sptep = NULL; int direct; gfn_t table_gfn; - int r; int level; bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; - pt_element_t curr_pte; struct kvm_shadow_walk_iterator iterator; if (!is_present_gpte(gw->ptes[gw->level - 1])) @@ -365,17 +374,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, direct, access, sptep); - if (!direct) { - r = kvm_read_guest_atomic(vcpu->kvm, - gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { + if (!direct) + /* + * Verify that the gpte in the page we've just write + * protected is still there. + */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); sptep = NULL; break; } - } link_shadow_page(sptep, sp); } -- cgit From 0b3c933302262d83018dd5f69656bca9f28a0cd3 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:09 +0300 Subject: KVM: MMU: Simplify spte fetch() function Partition the function into three sections: - fetching indirect shadow pages (host_level > guest_level) - fetching direct shadow pages (page_level < host_level <= guest_level) - the final spte (page_level == host_level) Instead of the current spaghetti. A slight change from the original code is that we call validate_direct_spte() more often: previously we called it only for gw->level, now we also call it for lower levels. The change should have no effect. 
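The new shape of the walk is easier to see with a toy that only prints what kind of shadow page would be built at each level. The level numbers assume a 4-level host walk (4 = PML4 down to 1 = PTE), and the guest and host levels are made-up inputs, not values taken from KVM.

#include <stdio.h>

int main(void)
{
	int guest_level = 2;	/* guest maps the address with a 2M page */
	int host_level = 1;	/* host backs it with 4K pages           */
	int level;

	for (level = 4; level > host_level; level--) {
		if (level > guest_level)
			printf("level %d: indirect sp (shadows a guest page table)\n",
			       level);
		else
			printf("level %d: direct sp (guest large page, host small pages)\n",
			       level);
	}
	printf("level %d: final spte pointing at the data page\n", host_level);
	return 0;
}

The output mirrors the three sections described above: indirect shadow pages down to the guest's own level, direct shadow pages below it, and the terminal spte at the mapping level.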
[xiao: fix regression caused by validate_direct_spte() called too late] Signed-off-by: Avi Kivity Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 93 ++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e1c1f9eb1cc1..368e4cb6233b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -321,9 +321,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, unsigned access = gw->pt_access; struct kvm_mmu_page *sp; u64 *sptep = NULL; - int direct; - gfn_t table_gfn; - int level; + int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -335,61 +333,68 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; - for_each_shadow_entry(vcpu, addr, iterator) { + for (shadow_walk_init(&iterator, vcpu, addr); + shadow_walk_okay(&iterator) && iterator.level > gw->level; + shadow_walk_next(&iterator)) { + gfn_t table_gfn; + level = iterator.level; sptep = iterator.sptep; - if (iterator.level == hlevel) { - mmu_set_spte(vcpu, sptep, access, - gw->pte_access & access, - user_fault, write_fault, - dirty, ptwrite, level, - gw->gfn, pfn, false, true); - break; - } - - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep) - && level == gw->level) - validate_direct_spte(vcpu, sptep, direct_access); drop_large_spte(vcpu, sptep); if (is_shadow_present_pte(*sptep)) continue; - if (level <= gw->level) { - direct = 1; - access = direct_access; - - /* - * It is a large guest pages backed by small host pages, - * So we set @direct(@sp->role.direct)=1, and set - * @table_gfn(@sp->gfn)=the base page frame for linear - * translations. - */ - table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - access &= gw->pte_access; - } else { - direct = 0; - table_gfn = gw->table_gfn[level - 2]; - } + table_gfn = gw->table_gfn[level - 2]; sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - direct, access, sptep); - if (!direct) - /* - * Verify that the gpte in the page we've just write - * protected is still there. - */ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) { - kvm_mmu_put_page(sp, sptep); - kvm_release_pfn_clean(pfn); - sptep = NULL; - break; - } + false, access, sptep); + + /* + * Verify that the gpte in the page we've just write + * protected is still there. 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + goto out_gpte_changed; + + link_shadow_page(sptep, sp); + } + + for (; + shadow_walk_okay(&iterator) && iterator.level > hlevel; + shadow_walk_next(&iterator)) { + gfn_t direct_gfn; + level = iterator.level; + sptep = iterator.sptep; + + validate_direct_spte(vcpu, sptep, direct_access); + + drop_large_spte(vcpu, sptep); + + if (is_shadow_present_pte(*sptep)) + continue; + + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, + true, direct_access, sptep); link_shadow_page(sptep, sp); } + sptep = iterator.sptep; + level = iterator.level; + + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, level, + gw->gfn, pfn, false, true); + return sptep; + +out_gpte_changed: + kvm_mmu_put_page(sp, sptep); + kvm_release_pfn_clean(pfn); + return NULL; } /* -- cgit From 5991b33237b7fc7dd9f62ae04998c42217d444a7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:10 +0300 Subject: KVM: MMU: Validate all gptes during fetch, not just those used for new pages Currently, when we fetch an spte, we only verify that gptes match those that the walker saw if we build new shadow pages for them. However, this misses the following race: vcpu1 vcpu2 walk change gpte walk instantiate sp fetch existing sp Fix by validating every gpte, regardless of whether it is used for building a new sp or not. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 368e4cb6233b..8cb85f9c8adb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -319,10 +319,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp = NULL; u64 *sptep = NULL; int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + int top_level; unsigned direct_access; struct kvm_shadow_walk_iterator iterator; @@ -333,6 +334,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (!dirty) direct_access &= ~ACC_WRITE_MASK; + top_level = vcpu->arch.mmu.root_level; + if (top_level == PT32E_ROOT_LEVEL) + top_level = PT32_ROOT_LEVEL; + /* + * Verify that the top-level gpte is still there. Since the page + * is a root page, it is either write protected (and cannot be + * changed from now on) or it is invalid (in which case, we don't + * really care if it changes underneath us after this point). 
+ */ + if (FNAME(gpte_changed)(vcpu, gw, top_level)) + goto out_gpte_changed; + for (shadow_walk_init(&iterator, vcpu, addr); shadow_walk_okay(&iterator) && iterator.level > gw->level; shadow_walk_next(&iterator)) { @@ -343,12 +356,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, drop_large_spte(vcpu, sptep); - if (is_shadow_present_pte(*sptep)) - continue; - - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + sp = NULL; + if (!is_shadow_present_pte(*sptep)) { + table_gfn = gw->table_gfn[level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + false, access, sptep); + } /* * Verify that the gpte in the page we've just write @@ -357,7 +370,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, level - 1)) goto out_gpte_changed; - link_shadow_page(sptep, sp); + if (sp) + link_shadow_page(sptep, sp); } for (; @@ -392,7 +406,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, return sptep; out_gpte_changed: - kvm_mmu_put_page(sp, sptep); + if (sp) + kvm_mmu_put_page(sp, sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit From 24157aaf833261e68e5a398fa54bd15e4fa1d0b7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:11 +0300 Subject: KVM: MMU: Eliminate redundant temporaries in FNAME(fetch) 'level' and 'sptep' are aliases for 'interator.level' and 'iterator.sptep', no need for them. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 59 +++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8cb85f9c8adb..d9a2742014e3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -320,12 +320,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, { unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - u64 *sptep = NULL; - int uninitialized_var(level); bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); int top_level; unsigned direct_access; - struct kvm_shadow_walk_iterator iterator; + struct kvm_shadow_walk_iterator it; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; @@ -346,68 +344,59 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - for (shadow_walk_init(&iterator, vcpu, addr); - shadow_walk_okay(&iterator) && iterator.level > gw->level; - shadow_walk_next(&iterator)) { + for (shadow_walk_init(&it, vcpu, addr); + shadow_walk_okay(&it) && it.level > gw->level; + shadow_walk_next(&it)) { gfn_t table_gfn; - level = iterator.level; - sptep = iterator.sptep; - - drop_large_spte(vcpu, sptep); + drop_large_spte(vcpu, it.sptep); sp = NULL; - if (!is_shadow_present_pte(*sptep)) { - table_gfn = gw->table_gfn[level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - false, access, sptep); + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access, it.sptep); } /* * Verify that the gpte in the page we've just write * protected is still there. 
*/ - if (FNAME(gpte_changed)(vcpu, gw, level - 1)) + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; if (sp) - link_shadow_page(sptep, sp); + link_shadow_page(it.sptep, sp); } for (; - shadow_walk_okay(&iterator) && iterator.level > hlevel; - shadow_walk_next(&iterator)) { + shadow_walk_okay(&it) && it.level > hlevel; + shadow_walk_next(&it)) { gfn_t direct_gfn; - level = iterator.level; - sptep = iterator.sptep; + validate_direct_spte(vcpu, it.sptep, direct_access); - validate_direct_spte(vcpu, sptep, direct_access); + drop_large_spte(vcpu, it.sptep); - drop_large_spte(vcpu, sptep); - - if (is_shadow_present_pte(*sptep)) + if (is_shadow_present_pte(*it.sptep)) continue; - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1, - true, direct_access, sptep); - link_shadow_page(sptep, sp); + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, + true, direct_access, it.sptep); + link_shadow_page(it.sptep, sp); } - sptep = iterator.sptep; - level = iterator.level; - - mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, - user_fault, write_fault, dirty, ptwrite, level, + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); - return sptep; + return it.sptep; out_gpte_changed: if (sp) - kvm_mmu_put_page(sp, sptep); + kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); return NULL; } -- cgit From c0e0608cb902af1a1fd8d413ec0a07ee1e62c652 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 13 Jul 2010 16:40:23 +0300 Subject: KVM: x86: emulator: inc/dec can have lock prefix Mark inc (0xfe/0 0xff/0) and dec (0xfe/1 0xff/1) as lock prefix capable. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 255473f974ab..b38bd8b92aa6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -345,10 +345,10 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, -- cgit From 68be0803456b3eed33038be5566710ad7648c854 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 14 Jul 2010 19:05:45 +0300 Subject: KVM: x86: never re-execute instruction with enabled tdp With tdp enabled we should get into emulator only when emulating io, so reexecution will always bring us back into emulator. 
Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index eb55ec55125c..689c2c3182ab 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3936,6 +3936,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) { gpa_t gpa; + if (tdp_enabled) + return false; + /* * if emulation was due to access to shadowed page table * and it failed try to unshadow page and re-entetr the -- cgit From 9195c4da26bbf8860e2e7b648dbf4ab465c7933a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 15 Jul 2010 12:24:37 +0300 Subject: KVM: x86: Call mask notifiers from pic If pit delivers interrupt while pic is masking it OS will never do EOI and ack notifier will not be called so when pit will be unmasked no pit interrupts will be delivered any more. Calling mask notifiers solves this issue. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 819b748a33f9..8d10c063d7f2 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -363,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } } else switch (s->init_state) { - case 0: /* normal mode */ + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; + } case 1: s->irq_base = val & 0xf8; s->init_state = 2; -- cgit From c19b8bd60e19308d5583ef200ddcc782d85d9543 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 15 Jul 2010 08:51:58 +0800 Subject: KVM: x86 emulator: fix xchg instruction emulation If the destination is a memory operand and the memory cannot map to a valid page, the xchg instruction emulation and locked instruction will not work on io regions and stuck in endless loop. We should emulate exchange as write to fix it. Signed-off-by: Wei Yongjun Acked-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 689c2c3182ab..97aab036dabf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3562,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } kaddr = kmap_atomic(page, KM_USER0); kaddr += offset_in_page(gpa); -- cgit From 6e3e243c3b6e0bbd18c6ce0fbc12bc3fe2d77b34 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 16 Jul 2010 11:52:55 +0200 Subject: KVM: MMU: fix mmu notifier invalidate handler for huge spte The index wasn't calculated correctly (off by one) for huge spte so KVM guest was unstable with transparent hugepages. 
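The off-by-one is easiest to see with concrete numbers. The sketch below compares the old and new index calculations for a memslot whose base gfn is not huge-page aligned; the gfn values are made up and the shift of 9 corresponds to 2M pages (512 4K frames).

#include <stdio.h>

#define HPAGE_GFN_SHIFT	9	/* 2M huge pages: 512 4K frames */

int main(void)
{
	unsigned long base_gfn = 0x1f0;	/* memslot start, not 2M aligned */
	unsigned long gfn = 0x200;	/* frame being invalidated       */
	unsigned long gfn_offset = gfn - base_gfn;
	unsigned long old_idx, new_idx;

	old_idx = gfn_offset >> HPAGE_GFN_SHIFT;
	new_idx = ((base_gfn + gfn_offset) >> HPAGE_GFN_SHIFT)
		  - (base_gfn >> HPAGE_GFN_SHIFT);

	printf("old idx = %lu, new idx = %lu\n", old_idx, new_idx);
	return 0;
}

The old formula picks huge-page slot 0 even though gfn 0x200 lies in the slot's second 2M region (index 1), which is exactly the kind of mismatch that destabilized guests using transparent hugepages.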
Signed-off-by: Andrea Arcangeli Reviewed-by: Reviewed-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36c62f33513f..812770cddc8d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -850,8 +850,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int sh; + + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); + idx = ((memslot->base_gfn+gfn_offset) >> sh) - + (memslot->base_gfn >> sh); ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); -- cgit From fa1de2bfc0feb7245328ad25fb3e6d5cd2c903b4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:19:51 +0800 Subject: KVM: MMU: add missing reserved bits check in speculative path In the speculative path, we should check guest pte's reserved bits just as the real processor does Reported-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++++++- arch/x86/kvm/paging_tmpl.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 812770cddc8d..d2ea9cabc066 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2697,6 +2697,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + return; + ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -2775,6 +2778,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); @@ -2849,6 +2853,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 
8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -2896,7 +2901,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry) + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d9a2742014e3..51ef9097960d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -638,8 +638,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (gfn != sp->gfns[i] || - !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + || gfn != sp->gfns[i] || !is_present_gpte(gpte) + || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; if (is_present_gpte(gpte) || !clear_unsync) -- cgit From daa3db693ce925a14b7e17ab6f306dc0e6a5342c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:23:04 +0800 Subject: KVM: MMU: fix broken page accessed tracking with ept enabled In current code, if ept is enabled(shadow_accessed_mask = 0), the page accessed tracking is lost. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2ea9cabc066..9b3b916ebeae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -687,7 +687,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); - if (old_spte & shadow_accessed_mask) + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); @@ -815,7 +815,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, kvm_set_pfn_dirty(spte_to_pfn(*spte)); old_spte = __xchg_spte(spte, new_spte); if (is_shadow_present_pte(old_spte) - && (old_spte & shadow_accessed_mask)) + && (!shadow_accessed_mask || + old_spte & shadow_accessed_mask)) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } -- cgit From 9ed5520dd3c9cb79c25f95fce9c57b87637d0fb7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:25:17 +0800 Subject: KVM: MMU: fix page dirty tracking lost while sync page In sync-page path, if spte.writable is changed, it will lose page dirty tracking, for example: assume spte.writable = 0 in a unsync-page, when it's synced, it map spte to writable(that is spte.writable = 1), later guest write spte.gfn, it means spte.gfn is dirty, then guest changed this mapping to read-only, after it's synced, spte.writable = 0 So, when host release the spte, it detect spte.writable = 0 and not mark page dirty Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b3b916ebeae..a04756a26fe2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1985,6 +1985,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) + kvm_set_pfn_dirty(pfn); update_spte(sptep, 
spte); done: return ret; @@ -1998,7 +2000,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -2048,15 +2049,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); - } else { - if (was_writable) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); } + kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; -- cgit From be233d49ea8c1fde9f4afec378dc2c2f16ab0263 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:27:10 +0800 Subject: KVM: MMU: don't atomicly set spte if it's not present If the old mapping is not present, the spte.a is not lost, so no need atomic operation to set it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a04756a26fe2..9c7fae08291d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,9 +307,10 @@ static void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || + !is_rmap_spte(*sptep)) __set_spte(sptep, new_spte); - } else { + else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); -- cgit From e4b502ead259fcf70839414abb7c8cdc3b523f01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:28:09 +0800 Subject: KVM: MMU: cleanup spte set and accssed/dirty tracking Introduce set_spte_track_bits() to cleanup current code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c7fae08291d..e4b862eb8885 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -679,7 +679,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte; @@ -692,6 +692,11 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + set_spte_track_bits(sptep, new_spte); rmap_remove(kvm, sptep); } @@ -791,7 +796,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte, old_spte; + u64 *spte, new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -812,13 +817,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(spte_to_pfn(*spte)); - old_spte = __xchg_spte(spte, new_spte); - if (is_shadow_present_pte(old_spte) - && (!shadow_accessed_mask || - old_spte & 
shadow_accessed_mask)) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + set_spte_track_bits(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } -- cgit From 9a3aad70572c3f4d55e7f09ac4eb313d41d0a484 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:30:18 +0800 Subject: KVM: MMU: using __xchg_spte more smarter Sometimes, atomically set spte is not needed, this patch call __xchg_spte() more smartly Note: if the old mapping's access bit is already set, we no need atomic operation since the access bit is not lost Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4b862eb8885..0dcc95e09876 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -682,9 +682,14 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; - u64 old_spte; + u64 old_spte = *sptep; + + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || + old_spte & shadow_accessed_mask) { + __set_spte(sptep, new_spte); + } else + old_spte = __xchg_spte(sptep, new_spte); - old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); -- cgit From 3444d7da1839b851eefedd372978d8a982316c36 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 18:32:38 +0300 Subject: KVM: VMX: Fix host GDT.LIMIT corruption vmx does not restore GDT.LIMIT to the host value, instead it sets it to 64KB. This means host userspace can learn a few bits of host memory. Fix by reloading GDTR when we load other host state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2fdcc9819f36..27a0222c2946 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -185,6 +185,7 @@ static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -871,6 +872,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (current_thread_info()->status & TS_USEDFPU) clts(); + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1379,6 +1381,8 @@ static int hardware_enable(void *garbage) ept_sync_global(); } + store_gdt(&__get_cpu_var(host_gdt)); + return 0; } -- cgit From e8c534ec068af1a0845aceda373a9bfd2de62030 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Tue, 27 Jul 2010 18:53:35 +0200 Subject: x86: Fix keeping track of AMD C1E Accomodate the original C1E-aware idle routine to the different times during boot when the BIOS enables C1E. While at it, remove the synthetic CPUID flag in favor of a single global setting which denotes C1E status on the system. [ hpa: changed c1e_enabled to be a bool; clarified cpu bit 3:21 comment ] Signed-off-by: Michal Schmidt LKML-Reference: <20100727165335.GA11630@aftab> Signed-off-by: Borislav Petkov Signed-off-by: H. 
Peter Anvin Acked-by: Thomas Gleixner --- arch/x86/include/asm/acpi.h | 2 +- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/process.c | 8 +++++--- 4 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aa2c39d968fc..92091de11113 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -134,7 +134,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + else if (c1e_detected) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 817aa316b180..0b205b8a4308 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -89,7 +89,7 @@ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */ #define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */ #define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */ + /* 21 available, was AMD_C1E */ #define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */ #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */ #define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d85637bb9505..325b7bdbebaa 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -762,6 +762,7 @@ extern void init_c1e_mask(void); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; extern unsigned long idle_nomwait; +extern bool c1e_detected; /* * on systems with caches, caches must be flashed as the absolute diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 553b02f13094..b944f89c4e6e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,8 +525,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); + static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -548,12 +550,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit From fe96eb404e33b59bb39f7050205f7c56c1c7d686 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 18 Mar 2010 13:53:24 -0400 Subject: x86: Detect whether we should use Xen SWIOTLB. It is paramount that we call pci_xen_swiotlb_detect before pci_swiotlb_detect as both implementations use the 'swiotlb' and 'swiotlb_force' flags. The pci-xen_swiotlb_detect inhibits the swiotlb_force and swiotlb flag so that the native SWIOTLB implementation is not enabled when running under Xen. [since v1 changed two Cc's to Acked-by] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Acked-by: FUJITA Tomonori [http://lkml.org/lkml/2010/7/27/374] Cc: Albert Herranz Cc: Ian Campbell Cc: Thomas Gleixner Acked-by: "H. 
Peter Anvin" [conditional http://lkml.org/lkml/2010/8/2/324] Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01dd..9f07cfcbd3a5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -13,6 +13,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; @@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) goto out; gart_iommu_hole_init(); @@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); out: + pci_xen_swiotlb_init(); + pci_swiotlb_init(); } @@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb) { + if (swiotlb || xen_swiotlb) { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_print_info(); -- cgit From be783a47214afc5a0aea9dafcbd9f1535ba05e94 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 2 Aug 2010 08:49:34 +0800 Subject: x86, vdso: Unmap vdso pages We mapped vdso pages but never unmapped them and the virtual address is lost after exiting from the function, so unmap vdso pages here. Signed-off-by: Shaohua Li LKML-Reference: <20100802004934.GA2505@sli10-desk.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ac74869b8140..80f23ed483ef 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -67,6 +67,7 @@ static int __init init_vdso_vars(void) *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" #undef VEXTERN + vunmap(vbase); return 0; oom: -- cgit From 22a57f5896df218356bae6203dfaf04bcfd6c88c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 15:34:44 -0700 Subject: x86, setup: Allow global variables and functions in the decompressor In order for global variables and functions to work in the decompressor, we need to fix up the GOT in assembly code. Signed-off-by: H. Peter Anvin LKML-Reference: <4C57382E.8050501@zytor.com> --- arch/x86/boot/compressed/head_32.S | 13 +++++++++++++ arch/x86/boot/compressed/head_64.S | 13 +++++++++++++ arch/x86/boot/compressed/vmlinux.lds.S | 6 ++++++ 3 files changed, 32 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index f543b70ffae2..67a655a39ce4 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -123,6 +123,19 @@ relocated: shrl $2, %ecx rep stosl +/* + * Adjust our own GOT + */ + leal _got(%ebx), %edx + leal _egot(%ebx), %ecx +1: + cmpl %ecx, %edx + jae 2f + addl %ebx, (%edx) + addl $4, %edx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. 
*/ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index faff0dc9c06a..52f85a196fa0 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -279,6 +279,19 @@ relocated: shrq $3, %rcx rep stosq +/* + * Adjust our own GOT + */ + leaq _got(%rip), %rdx + leaq _egot(%rip), %rcx +1: + cmpq %rcx, %rdx + jae 2f + addq %rbx, (%rdx) + addq $8, %rdx + jmp 1b +2: + /* * Do the decompression, and jump to the new kernel.. */ diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 5ddabceee124..34d047c98284 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -41,6 +41,12 @@ SECTIONS *(.rodata.*) _erodata = . ; } + .got : { + _got = .; + KEEP(*(.got.plt)) + KEEP(*(.got)) + _egot = .; + } .data : { _data = . ; *(.data) -- cgit From f4ed2877b16e8146427306aea8819adac5c88374 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 02:17:31 -0700 Subject: x86, setup: reorganize the early console setup Separate early_serial_console from tty.c This allows for reuse of early_serial_console.c/string.c/printf.c/cmdline.c in boot/compressed/. -v2: according to hpa, don't include string.c etc -v3: compressed/misc.c must have early_serial_base as static, so move it back to tty.c for setup code Signed-off-by: Yinghai Lu LKML-Reference: <4C568D2B.205@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 8 +- arch/x86/boot/boot.h | 35 +++++---- arch/x86/boot/cmdline.c | 6 +- arch/x86/boot/early_serial_console.c | 139 +++++++++++++++++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 ++++++ arch/x86/boot/printf.c | 4 +- arch/x86/boot/tty.c | 136 +--------------------------------- 7 files changed, 186 insertions(+), 163 deletions(-) create mode 100644 arch/x86/boot/early_serial_console.c create mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ec749c2bfdd7..f7cb086b4add 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,10 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o -setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o regs.o string.o tty.o video.o video-mode.o -setup-y += version.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o +setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o +setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o +setup-y += video-mode.o version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 46c4c5c71af7..00cf51cfc2e6 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -200,21 +200,7 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} +#include "isdigit.h" /* Heap -- available for dynamic lists. 
*/ extern char _end[]; @@ -300,8 +286,18 @@ struct biosregs { void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); /* cmdline.c */ -int cmdline_find_option(const char *option, char *buffer, int bufsize); -int cmdline_find_option_bool(const char *option); +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize); +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option); +static inline int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, buffer, bufsize); +} + +static inline int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option); +} + /* cpu.c, cpucheck.c */ struct cpu_features { @@ -313,6 +309,10 @@ extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + /* edd.c */ void query_edd(void); @@ -348,7 +348,6 @@ unsigned int atou(const char *s); unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base); /* tty.c */ -void console_init(void); void puts(const char *); void putchar(int); int getchar(void); diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index a1d35634bce0..6b3b6f708c04 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -27,9 +27,8 @@ static inline int myisspace(u8 c) * Returns the length of the argument (regardless of if it was * truncated to fit in the buffer), or -1 on not found. */ -int cmdline_find_option(const char *option, char *buffer, int bufsize) +int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, int bufsize) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int len = -1; @@ -100,9 +99,8 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) * Returns the position of that option (starts counting with 1) * or 0 on not found */ -int cmdline_find_option_bool(const char *option) +int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option) { - u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; addr_t cptr; char c; int pos = 0, wstart = 0; diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c new file mode 100644 index 000000000000..030f4b93e255 --- /dev/null +++ b/arch/x86/boot/early_serial_console.c @@ -0,0 +1,139 @@ +#include "boot.h" + +#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +#define DEFAULT_BAUD 9600 + +static void early_serial_init(int port, int baud) +{ + unsigned char c; + unsigned divisor; + + outb(0x3, port + LCR); /* 8n1 */ + outb(0, port + IER); /* no interrupt */ + outb(0, port + FCR); /* no fifo */ + outb(0x3, port + MCR); /* DTR + RTS */ + + divisor = 115200 / baud; + c = inb(port + LCR); + outb(c | DLAB, port + LCR); + outb(divisor & 0xff, port + DLL); + outb((divisor >> 8) & 0xff, port + DLH); + outb(c & ~DLAB, port + LCR); + + early_serial_base = 
port; +} + +static void parse_earlyprintk(void) +{ + int baud = DEFAULT_BAUD; + char arg[32]; + int pos = 0; + int port = 0; + + if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { + char *e; + + if (!strncmp(arg, "serial", 6)) { + port = DEFAULT_SERIAL_PORT; + pos += 6; + } + + if (arg[pos] == ',') + pos++; + + if (!strncmp(arg, "ttyS", 4)) { + static const int bases[] = { 0x3f8, 0x2f8 }; + int idx = 0; + + if (!strncmp(arg + pos, "ttyS", 4)) + pos += 4; + + if (arg[pos++] == '1') + idx = 1; + + port = bases[idx]; + } + + if (arg[pos] == ',') + pos++; + + baud = simple_strtoull(arg + pos, &e, 0); + if (baud == 0 || arg + pos == e) + baud = DEFAULT_BAUD; + } + + if (port) + early_serial_init(port, baud); +} + +#define BASE_BAUD (1843200/16) +static unsigned int probe_baud(int port) +{ + unsigned char lcr, dll, dlh; + unsigned int quot; + + lcr = inb(port + LCR); + outb(lcr | DLAB, port + LCR); + dll = inb(port + DLL); + dlh = inb(port + DLH); + outb(lcr, port + LCR); + quot = (dlh << 8) | dll; + + return BASE_BAUD / quot; +} + +static void parse_console_uart8250(void) +{ + char optstr[64], *options; + int baud = DEFAULT_BAUD; + int port = 0; + + /* + * console=uart8250,io,0x3f8,115200n8 + * need to make sure it is last one console ! + */ + if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) + return; + + options = optstr; + + if (!strncmp(options, "uart8250,io,", 12)) + port = simple_strtoull(options + 12, &options, 0); + else if (!strncmp(options, "uart,io,", 8)) + port = simple_strtoull(options + 8, &options, 0); + else + return; + + if (options && (options[0] == ',')) + baud = simple_strtoull(options + 1, &options, 0); + else + baud = probe_baud(port); + + if (port) + early_serial_init(port, baud); +} + +void console_init(void) +{ + parse_earlyprintk(); + + if (!early_serial_base) + parse_console_uart8250(); +} diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h new file mode 100644 index 000000000000..25e13403193c --- /dev/null +++ b/arch/x86/boot/isdigit.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 50e47cdbdddd..cdac91ca55d3 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -34,7 +34,7 @@ static int skip_atoi(const char **s) #define SMALL 32 /* Must be 32 == 0x20 */ #define SPECIAL 64 /* 0x */ -#define do_div(n,base) ({ \ +#define __do_div(n, base) ({ \ int __res; \ __res = ((unsigned long) n) % (unsigned) base; \ n = ((unsigned long) n) / (unsigned) base; \ @@ -83,7 +83,7 @@ static char *number(char *str, long num, int base, int size, int precision, tmp[i++] = '0'; else while (num != 0) - tmp[i++] = (digits[do_div(num, base)] | locase); + tmp[i++] = (digits[__do_div(num, base)] | locase); if (i > precision) precision = i; size -= precision; diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index ff4b27a0fc5e..def2451f46ae 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -15,27 +15,12 @@ #include "boot.h" -#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */ - -static int early_serial_base; +int early_serial_base; #define XMTRDY 0x20 -#define DLAB 0x80 - #define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ 
-#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ #define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -#define DEFAULT_BAUD 9600 /* * These functions are in .inittext so they can be used to signal @@ -152,122 +137,3 @@ int getchar_timeout(void) return 0; /* Timeout! */ } -static void early_serial_init(int port, int baud) -{ - unsigned char c; - unsigned divisor; - - outb(0x3, port + LCR); /* 8n1 */ - outb(0, port + IER); /* no interrupt */ - outb(0, port + FCR); /* no fifo */ - outb(0x3, port + MCR); /* DTR + RTS */ - - divisor = 115200 / baud; - c = inb(port + LCR); - outb(c | DLAB, port + LCR); - outb(divisor & 0xff, port + DLL); - outb((divisor >> 8) & 0xff, port + DLH); - outb(c & ~DLAB, port + LCR); - - early_serial_base = port; - - printf("Early serial console at I/O port 0x%x baud: %d\n", port, baud); -} - -static void parse_earlyprintk(void) -{ - int baud = DEFAULT_BAUD; - char arg[32]; - int pos = 0; - int port = 0; - - if (cmdline_find_option("earlyprintk", arg, sizeof arg) > 0) { - char *e; - - if (!strncmp(arg, "serial", 6)) { - port = DEFAULT_SERIAL_PORT; - pos += 6; - } - - if (arg[pos] == ',') - pos++; - - if (!strncmp(arg, "ttyS", 4)) { - static const int bases[] = { 0x3f8, 0x2f8 }; - int idx = 0; - - if (!strncmp(arg + pos, "ttyS", 4)) - pos += 4; - - if (arg[pos++] == '1') - idx = 1; - - port = bases[idx]; - } - - if (arg[pos] == ',') - pos++; - - baud = simple_strtoull(arg + pos, &e, 0); - if (baud == 0 || arg + pos == e) - baud = DEFAULT_BAUD; - } - - if (port) - early_serial_init(port, baud); -} - -#define BASE_BAUD (1843200/16) -static unsigned int probe_baud(int port) -{ - unsigned char lcr, dll, dlh; - unsigned int quot; - - lcr = inb(port + LCR); - outb(lcr | DLAB, port + LCR); - dll = inb(port + DLL); - dlh = inb(port + DLH); - outb(lcr, port + LCR); - quot = (dlh << 8) | dll; - - return BASE_BAUD / quot; -} - -static void parse_console_uart8250(void) -{ - char optstr[64], *options; - int baud = DEFAULT_BAUD; - int port = 0; - - /* - * console=uart8250,io,0x3f8,115200n8 - * need to make sure it is last one console ! - */ - if (cmdline_find_option("console", optstr, sizeof optstr) <= 0) - return; - - options = optstr; - - if (!strncmp(options, "uart8250,io,", 12)) - port = simple_strtoull(options + 12, &options, 0); - else if (!strncmp(options, "uart,io,", 8)) - port = simple_strtoull(options + 8, &options, 0); - else - return; - - if (options && (options[0] == ',')) - baud = simple_strtoull(options + 1, &options, 0); - else - baud = probe_baud(port); - - if (port) - early_serial_init(port, baud); -} - -void console_init(void) -{ - parse_earlyprintk(); - - if (!early_serial_base) - parse_console_uart8250(); -} -- cgit From 9f242dc10e0c3c1eb32d8c83c18650a35fd7f80d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 2 Aug 2010 16:10:37 -0700 Subject: x86, vmware: Preset lpj values when on VMware. When running on VMware's platform, we have seen situations where the AP's try to calibrate the lpj values and fail to get good calibration runs becasue of timing issues. As a result delays don't work correctly on all cpus. The solutions is to set preset_lpj value based on the current tsc frequency value. This is similar to what KVM does as well. Signed-off-by: Alok N Kataria LKML-Reference: <1280790637.14933.29.camel@ank32.eng.vmware.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff588445..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -- cgit From 8fee13a48e4879fba57725f6d9513df4bfa8e9f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Aug 2010 16:21:22 -0700 Subject: x86, setup: enable early console output from the decompressor This enables the decompressor output to be seen on the serial console. Most of the code is shared with the regular boot code. We could add printf to the decompressor if needed, but currently there is no sufficiently compelling user. -v2: define BOOT_BOOT_H to avoid include boot.h -v3: early_serial_base need to be static in misc.c ? -v4: create seperate string.c printf.c cmdline.c early_serial_console.c after hpa's patch that allow global variables in compressed/misc stage -v5: remove printf.c related Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 4 +- arch/x86/boot/compressed/cmdline.c | 21 ++++++++++ arch/x86/boot/compressed/early_serial_console.c | 5 +++ arch/x86/boot/compressed/misc.c | 56 +++++++++++++++---------- arch/x86/boot/compressed/misc.h | 38 +++++++++++++++++ arch/x86/boot/compressed/string.c | 4 ++ arch/x86/boot/main.c | 6 +-- 7 files changed, 105 insertions(+), 29 deletions(-) create mode 100644 arch/x86/boot/compressed/cmdline.c create mode 100644 arch/x86/boot/compressed/early_serial_console.c create mode 100644 arch/x86/boot/compressed/misc.h create mode 100644 arch/x86/boot/compressed/string.c (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fbb47daf2459..0c229551eead 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -4,7 +4,7 @@ # create a compressed vmlinux image from the original vmlinux # -targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o +targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 KBUILD_CFLAGS += -fno-strict-aliasing -fPIC @@ -23,7 +23,7 @@ LDFLAGS_vmlinux := -T hostprogs-y := mkpiggy -$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c new file mode 100644 index 000000000000..cb62f786990d --- /dev/null +++ b/arch/x86/boot/compressed/cmdline.c @@ -0,0 +1,21 @@ +#include "misc.h" + +static unsigned long fs; 
+static inline void set_fs(unsigned long seg) +{ + fs = seg << 4; /* shift it back */ +} +typedef unsigned long addr_t; +static inline char rdfs8(addr_t addr) +{ + return *((char *)(fs + addr)); +} +#include "../cmdline.c" +int cmdline_find_option(const char *option, char *buffer, int bufsize) +{ + return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, buffer, bufsize); +} +int cmdline_find_option_bool(const char *option) +{ + return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option); +} diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c new file mode 100644 index 000000000000..261e81fb9582 --- /dev/null +++ b/arch/x86/boot/compressed/early_serial_console.c @@ -0,0 +1,5 @@ +#include "misc.h" + +int early_serial_base; + +#include "../early_serial_console.c" diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 51e240779a44..8f7bef8e9fff 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -9,23 +9,7 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -/* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening - */ -#undef CONFIG_PARAVIRT -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif - -#include -#include -#include -#include -#include -#include -#include +#include "misc.h" /* WARNING!! * This code is compiled with -fPIC and it is relocated dynamically @@ -123,15 +107,13 @@ static void error(char *m); /* * This is set up by the setup-routine at boot-time */ -static struct boot_params *real_mode; /* Pointer to real-mode data */ +struct boot_params *real_mode; /* Pointer to real-mode data */ static int quiet; +static int debug; void *memset(void *s, int c, size_t n); void *memcpy(void *dest, const void *src, size_t n); -static void __putstr(int, const char *); -#define putstr(__x) __putstr(0, __x) - #ifdef CONFIG_X86_64 #define memptr long #else @@ -170,7 +152,21 @@ static void scroll(void) vidmem[i] = ' '; } -static void __putstr(int error, const char *s) +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ +static void serial_putchar(int ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + + outb(ch, early_serial_base + TXR); +} + +void __putstr(int error, const char *s) { int x, y, pos; char c; @@ -179,6 +175,14 @@ static void __putstr(int error, const char *s) if (!error) return; #endif + if (early_serial_base) { + const char *str = s; + while (*str) { + if (*str == '\n') + serial_putchar('\r'); + serial_putchar(*str++); + } + } if (real_mode->screen_info.orig_video_mode == 0 && lines == 0 && cols == 0) @@ -305,8 +309,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, { real_mode = rmode; - if (real_mode->hdr.loadflags & QUIET_FLAG) + if (cmdline_find_option_bool("quiet")) quiet = 1; + if (cmdline_find_option_bool("debug")) + debug = 1; if (real_mode->screen_info.orig_video_mode == 7) { vidmem = (char *) 0xb0000; @@ -319,6 +325,10 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, lines = real_mode->screen_info.orig_video_lines; cols = real_mode->screen_info.orig_video_cols; + console_init(); + if (debug) + putstr("early console in decompress_kernel\n"); + free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; diff --git 
a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h new file mode 100644 index 000000000000..a267849ac1c9 --- /dev/null +++ b/arch/x86/boot/compressed/misc.h @@ -0,0 +1,38 @@ +#ifndef BOOT_COMPRESSED_MISC_H +#define BOOT_COMPRESSED_MISC_H + +/* + * we have to be careful, because no indirections are allowed here, and + * paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening + */ +#undef CONFIG_PARAVIRT +#ifdef CONFIG_X86_32 +#define _ASM_X86_DESC_H 1 +#endif + +#include +#include +#include +#include +#include +#include +#include + +#define BOOT_BOOT_H + +/* misc.c */ +extern struct boot_params *real_mode; /* Pointer to real-mode data */ +void __putstr(int error, const char *s); +#define putstr(__x) __putstr(0, __x) +#define puts(__x) __putstr(0, __x) + +/* cmdline.c */ +int cmdline_find_option(const char *option, char *buffer, int bufsize); +int cmdline_find_option_bool(const char *option); + +/* early_serial_console.c */ +extern int early_serial_base; +void console_init(void); + +#endif diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c new file mode 100644 index 000000000000..7995c6a49509 --- /dev/null +++ b/arch/x86/boot/compressed/string.c @@ -0,0 +1,4 @@ +#include "misc.h" + +#include "../isdigit.h" +#include "../string.c" diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 4ef1a33e8572..40358c8905be 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -132,6 +132,8 @@ void main(void) /* Initialize the early-boot console */ console_init(); + if (cmdline_find_option_bool("debug")) + puts("early console in setup code\n"); /* End of heap check */ init_heap(); @@ -171,10 +173,6 @@ void main(void) /* Set the video mode */ set_video(); - /* Parse command line for 'quiet' and pass it to decompressor. */ - if (cmdline_find_option_bool("quiet")) - boot_params.hdr.loadflags |= QUIET_FLAG; - /* Do the last things and invoke protected mode */ go_to_protected_mode(); } -- cgit From 6238b47b58480cd9c092600c05338dbe261b71ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 Aug 2010 21:03:46 -0700 Subject: x86, setup: move isdigit.h to ctype.h, header files on top. It is a subset of functionality, so name it ctype.h. Also, reorganize header files so #include statements are clustered near the top as they should be. Signed-off-by: H. Peter Anvin LKML-Reference: <4C5752F2.8030206@kernel.org> --- arch/x86/boot/boot.h | 3 +-- arch/x86/boot/compressed/misc.h | 1 + arch/x86/boot/compressed/string.c | 2 -- arch/x86/boot/ctype.h | 21 +++++++++++++++++++++ arch/x86/boot/isdigit.h | 21 --------------------- 5 files changed, 23 insertions(+), 25 deletions(-) create mode 100644 arch/x86/boot/ctype.h delete mode 100644 arch/x86/boot/isdigit.h (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 00cf51cfc2e6..c7093bd9f2d3 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -28,6 +28,7 @@ #include "bitops.h" #include #include +#include "ctype.h" /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -200,8 +201,6 @@ static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) return diff; } -#include "isdigit.h" - /* Heap -- available for dynamic lists. 
*/ extern char _end[]; extern char *HEAP; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index a267849ac1c9..3f19c81a6203 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -20,6 +20,7 @@ #include #define BOOT_BOOT_H +#include "../ctype.h" /* misc.c */ extern struct boot_params *real_mode; /* Pointer to real-mode data */ diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c index 7995c6a49509..19b3e693cd72 100644 --- a/arch/x86/boot/compressed/string.c +++ b/arch/x86/boot/compressed/string.c @@ -1,4 +1,2 @@ #include "misc.h" - -#include "../isdigit.h" #include "../string.c" diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h new file mode 100644 index 000000000000..25e13403193c --- /dev/null +++ b/arch/x86/boot/ctype.h @@ -0,0 +1,21 @@ +#ifndef BOOT_ISDIGIT_H + +#define BOOT_ISDIGIT_H + +static inline int isdigit(int ch) +{ + return (ch >= '0') && (ch <= '9'); +} + +static inline int isxdigit(int ch) +{ + if (isdigit(ch)) + return true; + + if ((ch >= 'a') && (ch <= 'f')) + return true; + + return (ch >= 'A') && (ch <= 'F'); +} + +#endif diff --git a/arch/x86/boot/isdigit.h b/arch/x86/boot/isdigit.h deleted file mode 100644 index 25e13403193c..000000000000 --- a/arch/x86/boot/isdigit.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef BOOT_ISDIGIT_H - -#define BOOT_ISDIGIT_H - -static inline int isdigit(int ch) -{ - return (ch >= '0') && (ch <= '9'); -} - -static inline int isxdigit(int ch) -{ - if (isdigit(ch)) - return true; - - if ((ch >= 'a') && (ch <= 'f')) - return true; - - return (ch >= 'A') && (ch <= 'F'); -} - -#endif -- cgit From 35f2915c3bd0cd6950bdd9d461de565e8feae852 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 1 Jun 2010 13:07:34 +0100 Subject: intel_scu_ipc: add definitions for vRTC related command Signed-off-by: Feng Tang Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 4470c9ad4a3e..03200452069b 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,6 +1,12 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#define IPCMSG_VRTC 0xFA /* Set vRTC device */ + +/* Command id associated with message IPCMSG_VRTC */ +#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ +#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ + /* Read single register */ int intel_scu_ipc_ioread8(u16 addr, u8 *data); -- cgit From 804f8681a99da2aa49bd7f0dab3750848d1ab1bc Mon Sep 17 00:00:00 2001 From: Sreedhara DS Date: Mon, 26 Jul 2010 10:03:10 +0100 Subject: Remove indirect read write api support. The firmware of production devices does not support this interface so this is dead code. 
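For illustration only, here is a rough sketch of how a platform vRTC driver might combine the IPCMSG_VRTC command IDs added above with the command interface that remains after this cleanup. The helper name mrst_vrtc_set_mmss() and the surrounding flow are assumptions; only IPCMSG_VRTC, IPC_CMD_VRTC_SETTIME and intel_scu_ipc_simple_command() come from intel_scu_ipc.h.

    #include <linux/kernel.h>
    #include <asm/intel_scu_ipc.h>

    /* Hypothetical caller: the new wall-clock time would be written into
     * the vRTC registers first (not shown), then the SCU firmware is told
     * to latch it. */
    static int mrst_vrtc_set_mmss(unsigned long secs)
    {
            int err;

            err = intel_scu_ipc_simple_command(IPCMSG_VRTC,
                                               IPC_CMD_VRTC_SETTIME);
            if (err)
                    pr_err("vRTC: SETTIME IPC failed (%d)\n", err);

            return err;
    }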
Signed-off-by: Sreedhara DS Signed-off-by: Alan Cox Signed-off-by: Matthew Garrett --- arch/x86/include/asm/intel_scu_ipc.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 03200452069b..29f66793cc55 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -34,20 +34,6 @@ int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); /* Update single register based on the mask */ int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); -/* - * Indirect register read - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_read(u32 addr, u32 *data); - -/* - * Indirect register write - * Can be used when SCCB(System Controller Configuration Block) register - * HRIM(Honor Restricted IPC Messages) is set (bit 23) - */ -int intel_scu_ipc_register_write(u32 addr, u32 data); - /* Issue commands to the SCU with or without data */ int intel_scu_ipc_simple_command(int cmd, int sub); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, -- cgit From 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 18 May 2010 13:59:05 +0200 Subject: x86, mce: Notify about corrected events too Notify all parties registered on the mce decoder chain about logged correctable MCEs. Signed-off-by: Borislav Petkov Acked-by: Doug Thompson Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..1970ef911c99 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } -- cgit From 5d77b85458f656923b85291a4ff56ed44859ed52 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 20 Jul 2010 13:52:00 -0400 Subject: [CPUFREQ] pcc driver should check for pcch method before calling _OSC The pcc specification documents an _OSC method that's incompatible with the one defined as part of the ACPI spec. This shouldn't be a problem as both are supposed to be guarded with a UUID. Unfortunately approximately nobody (including HP, who wrote this spec) properly check the UUID on entry to the _OSC call. Right now this could result in surprising behaviour if the pcc driver performs an _OSC call on a machine that doesn't implement the pcc specification. Check whether the PCCH method exists first in order to reduce this probability. 
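Put differently, the probe path gains a guard of roughly this shape (a restatement of the hunk below; the pcc_platform_present() wrapper is hypothetical, the real driver open-codes the check at the top of pcc_cpufreq_probe()):

    #include <linux/acpi.h>

    static int pcc_platform_present(void)
    {
            acpi_status status;
            acpi_handle sb_handle, pcch_handle;

            status = acpi_get_handle(NULL, "\\_SB", &sb_handle);
            if (ACPI_FAILURE(status))
                    return 0;

            /* No PCCH method means this is not a PCC platform, so the
             * PCC-flavoured _OSC is never evaluated on machines that only
             * implement the ACPI-spec _OSC. */
            status = acpi_get_handle(sb_handle, "PCCH", &pcch_handle);
            if (ACPI_FAILURE(status))
                    return 0;

            return 1;
    }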
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e71..01bd25c3c7ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void) struct pcc_memory_resource *mem_resource; struct pcc_register_resource *reg_resource; union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle; + acpi_handle handle, osc_handle, pcch_handle; int ret = 0; status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) return -ENODEV; + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + status = acpi_get_handle(handle, "_OSC", &osc_handle); if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); -- cgit From 0d9715d64fe118dd0957a29e344972b8d3f960e7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 23 Jul 2010 23:06:52 +0100 Subject: [CPUFREQ] fix double freeing in error path of pcc-cpufreq Prevent double freeing on error path. Signed-off-by: Daniel J Blueman Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 01bd25c3c7ca..900702888bfb 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } + if (out_obj->type != ACPI_TYPE_BUFFER) + return -ENODEV; errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } + if (errors) + return -ENODEV; supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } + if (!(supported & 0x1)) + return -ENODEV; out_free: kfree(output.pointer); -- cgit From 6ebdf777ba034d2b54c99f28a4b18dabf286d8e5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 15 Jul 2010 11:44:00 -0400 Subject: [CPUFREQ] Fix PCC driver error path The PCC cpufreq driver unmaps the mailbox address range if any CPUs fail to initialise, but doesn't do anything to remove the registered CPUs from the cpufreq core resulting in failures further down the line. We're better off simply returning a failure - the cpufreq core will unregister us cleanly if we end up with no successfully registered CPUs. Tidy up the failure path and also add a sanity check to ensure that the firmware gives us a realistic frequency - the core deals badly with that being set to 0. 
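The frequency sanity check boils down to the following fragment in the driver's ->init() callback (shown in isolation here; the real hunk is in the diff below). Returning an error is sufficient because the cpufreq core then unregisters the partially initialised CPU on its own.

    policy->cur = pcc_get_freq(cpu);
    if (!policy->cur) {
            /* A current frequency of 0 would confuse governors and the
             * cpufreq statistics code, so refuse to register this CPU. */
            dprintk("init: Unable to get current CPU frequency\n");
            return -EINVAL;
    }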
Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 900702888bfb..a36de5bbb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -541,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!pcch_virt_addr) { result = -1; - goto pcch_null; + goto out; } result = pcc_get_offset(cpu); if (result) { dprintk("init: PCCP evaluation failed\n"); - goto free; + goto out; } policy->max = policy->cpuinfo.max_freq = @@ -556,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) ioread32(&pcch_hdr->minimum_frequency) * 1000; policy->cur = pcc_get_freq(cpu); + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + dprintk("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); - - return 0; -free: - pcc_clear_mapping(); - free_percpu(pcc_cpu_info); -pcch_null: +out: return result; } -- cgit From c2f4a2c6e08c7635316dfd25ef706e9104384c56 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 17:55:30 +0200 Subject: [CPUFREQ] powernow-k8: Limit Pstate transition latency check The Pstate transition latency check was added for broken F10h BIOSen which wrongly contain a value of 0 for transition and bus master latency. Fam11h and later, however, (will) have similar transition latency so extend that behavior for them too. Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e6..3e90cce3dc8b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1023,13 +1023,12 @@ static int get_transition_latency(struct powernow_k8_data *data) } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit From 298decfbc44e9a4cb7862ae1b7dfc4e1ba3551b9 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Wed, 20 Jan 2010 19:19:33 +0200 Subject: [CPUFREQ] powernow-k8: On load failure, remind the user to enable support in BIOS setup On Wed, 2010-01-20 at 16:56 +0100, Thomas Renninger wrote: > But most often this happens if people upgrade their CPU and do not > update their BIOS. > Or the vendor does not recognise the new CPU even if the BIOS got > updated. Maybe some of those people just didn't realize it was disabled in BIOS? If you tell users that it's a firmware bug then they'll probably just give up. 
> The itself message might be an enhancment, IMO it's not worth a patch. Why do you think so? I spent an hour on hunting down the BIOS upgrade, only to find that it didn't improve anything. It was a day later that I realized that it might be a BIOS option; and the option was literally the _last_ option in the whole BIOS setup. :) This message would have saved the day. > But do not revert the FW_BUG part! Sure, you have a point here. How about this patch? --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce3dc8b..c48b44b3b43a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } -- cgit From 6b72e3934b42930fd40fc42fe762d21be413301c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:35 +0200 Subject: [CPUFREQ] acpi-cpufreq: Fix CPU_ANY CPUFREQ_{PRE,POST}CHANGE notification Signed-off-by: Thomas Renninger CC: venki@google.com CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40ee..cee7aa949c35 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -351,7 +351,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -367,7 +367,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } -- cgit From 6f4f2723d08534fd4e407e1ef8500b0f4d12c30c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:36 +0200 Subject: [CPUFREQ] x86 cpufreq: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. trace_power_frequency was only implemented in a rather ungeneric way in acpi-cpufreq driver's target() function only. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequency depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and pass the cpu which gets switched automatically fixes this. 
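The drivers/cpufreq side of this move is outside the arch/x86 diff below, so the following is only an assumed sketch of the new call site: a helper invoked from the CPUFREQ_POSTCHANGE branch of cpufreq_notify_transition(), which already runs once per CPU in policy->cpus, with the affected CPU passed as the tracepoint's third argument. The helper name is made up.

    #include <linux/cpufreq.h>
    #include <trace/events/power.h>

    /* Emitting the event in the generic POSTCHANGE path means userspace-
     * governor switches and cores whose frequency depends on siblings are
     * traced correctly, without every driver calling the tracepoint. */
    static void cpufreq_trace_new_frequency(const struct cpufreq_freqs *freqs)
    {
            trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu);
    }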
Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id Signed-off-by: Thomas Renninger CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org CC: robert.schoene@tu-dresden.de Tested-by: robert.schoene@tu-dresden.de Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cee7aa949c35..246cd3afbb5f 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..787572d43d9c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit From ccc5638a20b0eb3a66666d9d4dd8fe8f5ad40386 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:03:55 +0400 Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. 
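For reference, for_each_pci_dev() is just a wrapper around the pci_get_device() loop it replaces. The two loops below are interchangeable (handle_dev() is a stand-in for the loop body); the iterator must start out NULL so the walk begins at the first device, and pci_get_device() drops the reference on the device it was handed before returning the next one.

    #include <linux/pci.h>

    struct pci_dev *dev = NULL;

    /* open-coded form being replaced */
    while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL)
            handle_dev(dev);

    /* equivalent macro form used by this patch */
    for_each_pci_dev(dev)
            handle_dev(dev);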
Signed-off-by: Kulikov Vasiliy Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e3..8c3325fee77c 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -199,7 +199,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } -- cgit From 55c789bb2bcdcaa8f1f60687b4a9dbd02ffddd88 Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Thu, 15 Jul 2010 20:36:41 +0200 Subject: [CPUFREQ] Convert pci_table entries to PCI_VDEVICE (if PCI_ANY_ID is used) This patch converts pci_table entries, where .subvendor=PCI_ANY_ID and .subdevice=PCI_ANY_ID, .class=0 and .class_mask=0, to use the PCI_VDEVICE macro, and thus improves readability. Signed-off-by: Peter Huewe Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 8c3325fee77c..32974cf84232 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; -- cgit From b30d3304c9c068ccfe6940232834768af75f8c9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 18:05:14 +0200 Subject: [CPUFREQ] powernow-k8: Fix misleading variable naming rdmsr() takes the lower 32 bits as a second argument and the high 32 as a third. Fix the names accordingly since they were swapped. There should be no functionality change resulting from this patch. 
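As a minimal illustration of the convention the rename follows (the particular MSR is incidental): rdmsr()'s second argument receives EAX, i.e. the low 32 bits, and its third receives EDX, the high 32 bits.

    u32 lo, hi;
    u64 val;

    rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); /* lo <- EAX (31:0), hi <- EDX (63:32) */
    val = ((u64)hi << 32) | lo;          /* full 64-bit MSR value, if needed */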
Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c48b44b3b43a..90cab2d4ac0d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -912,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; -- cgit From 7e2d81122052c83feeddbebf706b6d53fba7996d Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:28:49 +0800 Subject: [CPUFREQ] Fix section mismatch for longrun_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c52..fc09f142d94d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; -- cgit From 2530573e45c5846cd238db78651f0d236fc78aab Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:03 +0800 Subject: [CPUFREQ] Fix section mismatch for longhaul_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. Use the __cpuinitdata for data used by the routines marked as __cpuinit. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. 
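These three section-mismatch fixes all apply the same rule, sketched here with made-up names: a cpufreq ->init() callback can run again whenever a CPU is brought online long after boot, so it and any table it references must live in the cpuinit sections rather than the plain init sections that are discarded once booting finishes.

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/cpufreq.h>

    /* Data referenced from __cpuinit text must be __cpuinitdata, not
     * __initdata, or CONFIG_DEBUG_SECTION_MISMATCH=y will warn. */
    static const int __cpuinitdata example_mults[4] = { 30, 40, 50, 60 };

    static int __cpuinit example_cpu_init(struct cpufreq_policy *policy)
    {
            /* May execute at CPU-online time, after free_initmem(). */
            pr_debug("first multiplier: %d\n", example_mults[0]);
            return 0;
    }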
Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/longhaul.h | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f8261..03162dac6271 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f79..cbf48fbca881 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. 
*/ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, -- cgit From 307069cf6c53632adc27de4f49bf5d1d67cb87bb Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:16 +0800 Subject: [CPUFREQ] Fix section mismatch for powernow_cpu_init in powernow-k7.c Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e5..4a45fd6e41ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. 
*/ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; -- cgit From 9d1f44ee206a23b975d7d7c6f759efb25e0e61ac Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 3 Aug 2010 13:47:30 -0400 Subject: [CPUFREQ] Remove pointless printk from p4-clockmod. The only machines this is triggering on should be supported by acpi-cpufreq or acpi's internal throttling. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b07..bd1cac747f67 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ -- cgit From cb84b19474384c572ba3aa2345815e555112ebf5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:43 -0700 Subject: x86, hwmon: Package Level Thermal/Power: pkgtemp hwmon driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a hwmon driver for package level thermal control. The driver dumps package level thermal information through sysfs interface so that upper level application (e.g. lm_sensor) can retrive the information. Instead of having the package level hwmon code in coretemp, I write a seperate driver pkgtemp because: First, package level thermal sensors include not only sensors for each core, but also sensors for uncore, memory controller or other components in the package. Logically it will be clear to have a seperate hwmon driver for package level hwmon to monitor wider range of sensors in a package. Merging package thermal driver into core thermal driver doesn't make sense and may mislead. Secondly, merging the two drivers together may cause coding mess. It's easier to include various package level sensors info if more sensor information is implemented. Coretemp code needs to consider a lot of legacy machine cases. Pkgtemp code only considers platform starting from Sandy Bridge. 
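The pkgtemp driver itself lives under drivers/hwmon/ and is not part of this arch/x86 diff, so the following is only a hedged sketch of how the exposed package reading is derived: like coretemp's per-core value, it comes from the digital readout field (bits 22:16, in degrees C below TjMax) of IA32_PACKAGE_THERM_STATUS. The cpu and tjmax_mdeg variables are illustrative.

    u32 eax, edx;
    int temp_mdeg;

    rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx);
    /* readout is degrees below TjMax; hwmon reports millidegrees C */
    temp_mdeg = tjmax_mdeg - ((eax >> 16) & 0x7f) * 1000;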
On a 1Sx4Cx2T Sandy Bridge platform, lm-sensors dumps the pkgtemp and coretemp: pkgtemp-isa-0000 Adapter: ISA adapter physical id 0: +33.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0000 Adapter: ISA adapter Core 0: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0001 Adapter: ISA adapter Core 1: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0002 Adapter: ISA adapter Core 2: +32.0°C (high = +79.0°C, crit = +99.0°C) coretemp-isa-0003 Adapter: ISA adapter Core 3: +32.0°C (high = +79.0°C, crit = +99.0°C) [ hpa: folded v3 patch removing improper global variable "SHOW" ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-3-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 1 + arch/x86/configs/x86_64_defconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index d28fad19654a..e3a32431ca1e 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1471,6 +1471,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 6c86acd847a4..4251f8372050 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1456,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set -- cgit From 55d435a227bd28c77afab326de44dfacc0b15059 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:44 -0700 Subject: x86, hwmon: Package Level Thermal/Power: thermal throttling handler Add package level thermal throttle interrupt support. The interrupt handler increases package level thermal throttle count. It also logs the event in MCE log. The package level thermal throttle interrupt happens across threads in a package. Each thread handles the interrupt individually. User level application is supposed to retrieve correct event count and log based on package/thread topology. This is the same situation for core level interrupt handler. In the future, interrupt may be reported only per package or per core. core_throttle_count and package_throttle_count are used for user interface. Previously only throttle_count is used for core throttle count. If you think new core_throttle_count name breaks user interface, I can change this part. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-4-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 +++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf9716..d307f9f64c23 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -37,7 +37,7 @@ /* * Current thermal throttling state: */ -struct thermal_state { +struct _thermal_state { bool is_throttled; u64 next_check; @@ -45,6 +45,11 @@ struct thermal_state { unsigned long last_throttle_count; }; +struct thermal_state { + struct _thermal_state core; + struct _thermal_state package; +}; + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(level, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##level##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).level.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_one_ro(package_throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; @@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 +static int therm_throt_process(bool is_throttled, int level) { - struct thermal_state *state; + struct _thermal_state *state; unsigned int this_cpu; bool was_throttled; u64 now; this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) + state = &per_cpu(thermal_state, this_cpu).core; + else + state = &per_cpu(thermal_state, this_cpu).package; was_throttled = state->is_throttled; state->is_throttled = is_throttled; @@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled) /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package", + state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_throttle_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device); static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + PACKAGE_LEVEL) != 0) + /* + * Set up the most significant bit to notify mce log + * that this thermal event is a package level event. + * This is a temp solution. May be changed in the future + * with mce log infrasture. + */ + mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); -- cgit From 0199114c31798af5b83841b21759b64171060d9b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:45 -0700 Subject: x86, hwmon: Package Level Thermal/Power: power limit Power limit notification feature is published in Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification. It is implemented first on Intel Sandy Bridge platform. The patch handles notification interrupt. Interrupt handler dumps power limit information in log_buf, logs the event in mce log, and increases the event counters (core_power_limit and package_power_limit). Upper level applications could use the data to detect system health or diagnose functionality/performance issues. In the future, the event could be handled in a more fancy way. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-5-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 183 ++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d307f9f64c23..c2a8b26d4fea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ struct _thermal_state { - bool is_throttled; - + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; }; struct thermal_state { - struct _thermal_state core; - struct _thermal_state package; + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly; therm_throt_sysdev_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(level, name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##level##_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).level.name); \ + per_cpu(thermal_state, cpu).event.name); \ } else \ ret = 0; \ preempt_enable(); \ @@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); define_therm_throt_sysdev_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); + static struct attribute *thermal_throttle_attrs[] = { &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -116,49 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. 
*/ -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 -static int therm_throt_process(bool is_throttled, int level) +static int therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - if (level == CORE_LEVEL) - state = &per_cpu(thermal_state, this_cpu).core; - else - state = &per_cpu(thermal_state, this_cpu).package; + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? 
"Core" : "Package"); return 1; } @@ -172,21 +207,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) int err; struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) return err; + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, - thermal_throttle_attr_group.name); + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -257,6 +300,17 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -264,21 +318,31 @@ static void intel_thermal_interrupt(void) struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) - /* - * Set up the most significant bit to notify mce log - * that this thermal event is a package level event. - * This is a temp solution. May be changed in the future - * with mce log infrasture. 
- */ - mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); } } @@ -381,14 +445,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); } smp_thermal_vector = intel_thermal_interrupt; -- cgit From 8a22b9996b001c88f2bfb54c6de6a05fc39e177a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 12 Jul 2010 11:49:59 -0700 Subject: xen: drop xen_sched_clock in favour of using plain wallclock time xen_sched_clock only counts unstolen time. In principle this should be useful to the Linux scheduler so that it knows how much time a process actually consumed. But in practice this doesn't work very well as the scheduler expects the sched_clock time to be synchronized between cpus. It also uses sched_clock to measure the time a task spends sleeping, in which case "unstolen time" isn't meaningful. So just use plain xen_clocksource_read to return wallclock nanoseconds for sched_clock. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/time.c | 39 --------------------------------------- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 399bed2de881..fef034a04c24 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -926,7 +926,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 32764b8880b5..e90360ff4a08 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -155,45 +155,6 @@ static void do_stolen_accounting(void) account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. 
- */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { -- cgit From c06ee78d73fd24e8d8a65f16380f6a0551107e1b Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 19 Jul 2010 10:25:08 -0700 Subject: xen: support large numbers of CPUs with vcpu info placement When vcpu info placement is supported, we're not limited to MAX_VIRT_CPUS vcpus. However, if it isn't supported, then ignore any excess vcpus. Signed-off-by: Mukesh Rathor Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fef034a04c24..90a3e8026767 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -97,6 +97,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -104,13 +112,17 @@ static void xen_vcpu_setup(int cpu) struct vcpu_info *vcpup; BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } + vcpup = &per_cpu(xen_vcpu_info, cpu); info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); @@ -125,6 +137,7 @@ static void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ -- cgit From f09f6d194d85043e0eb105a577e7ad6d8170ab66 Mon Sep 17 00:00:00 2001 From: Donald Dutile Date: Thu, 15 Jul 2010 14:56:49 -0400 Subject: Xen: register panic notifier to take crashes of xen guests on panic Register a panic notifier so that when the guest crashes it can shut down the domain and indicate it was a crash to the host. 
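For readers less familiar with the mechanism, the generic panic-notifier pattern the patch relies on looks roughly like this (a sketch with hypothetical names, not the Xen code itself; the actual callback below issues the crash-shutdown hypercall):

#include <linux/kernel.h>
#include <linux/notifier.h>

/* Sketch only: register a callback on the global panic notifier chain. */
static int example_panic_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	/* tell the host that this shutdown is a crash, not a clean halt */
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_block = {
	.notifier_call = example_panic_event,
};

static int __init example_panic_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &example_panic_block);
	return 0;
}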
Signed-off-by: Donald Dutile Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 20 ++++++++++++++++++++ arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 90a3e8026767..d99522e8f033 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1040,6 +1040,26 @@ static void xen_crash_shutdown(struct pt_regs *regs) xen_reboot(SHUTDOWN_crash); } +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct sched_shutdown r = { .reason = SHUTDOWN_crash}; + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) + BUG(); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + static const struct machine_ops __initdata xen_machine_ops = { .restart = xen_restart, .halt = xen_machine_halt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9deb6bab6c78..328b00305426 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -226,6 +226,8 @@ void __init xen_arch_setup(void) struct physdev_set_iopl set_iopl; int rc; + xen_panic_handler_init(); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f9153a300bce..00d59d608edf 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -101,4 +101,6 @@ void xen_sysret32(void); void xen_sysret64(void); void xen_adjust_exception_frame(void); +extern int xen_panic_handler_init(void); + #endif /* XEN_OPS_H */ -- cgit From 086748e52fb072ff0935ba4512e29c421bd5b716 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 3 Aug 2010 14:55:14 -0700 Subject: xen/panic: use xen_reboot and fix smp_send_stop Offline vcpu when using stop_self. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 +---- arch/x86/xen/smp.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d99522e8f033..3c4da8bee06f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1043,10 +1043,7 @@ static void xen_crash_shutdown(struct pt_regs *regs) static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct sched_shutdown r = { .reason = SHUTDOWN_crash}; - - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) - BUG(); + xen_reboot(SHUTDOWN_crash); return NOTIFY_DONE; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd3138..25f232b18a82 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -394,6 +394,8 @@ static void stop_self(void *v) load_cr3(swapper_pg_dir); /* should set up a minimal gdt */ + set_cpu_online(cpu, false); + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); BUG(); } -- cgit From 12bfa3de63504d879ae427ec1f2884fc46556157 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:20 -0500 Subject: kgdb,x86: Individual register get/set for x86 Implement the ability to individually get and set registers for kdb and kgdb for x86. Signed-off-by: Jason Wessel Acked-by: H. 
Peter Anvin CC: Ingo Molnar CC: x86@kernel.org --- arch/x86/include/asm/kgdb.h | 20 +++--- arch/x86/kernel/kgdb.c | 168 ++++++++++++++++++++++---------------------- 2 files changed, 94 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 006da3687cdc..396f5b5fc4d7 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -39,9 +39,11 @@ enum regnames { GDB_FS, /* 14 */ GDB_GS, /* 15 */ }; +#define GDB_ORIG_AX 41 +#define DBG_MAX_REG_NUM 16 #define NUMREGBYTES ((GDB_GS+1)*4) #else /* ! CONFIG_X86_32 */ -enum regnames64 { +enum regnames { GDB_AX, /* 0 */ GDB_BX, /* 1 */ GDB_CX, /* 2 */ @@ -59,15 +61,15 @@ enum regnames64 { GDB_R14, /* 14 */ GDB_R15, /* 15 */ GDB_PC, /* 16 */ + GDB_PS, /* 17 */ + GDB_CS, /* 18 */ + GDB_SS, /* 19 */ }; - -enum regnames32 { - GDB_PS = 34, - GDB_CS, - GDB_SS, -}; -#define NUMREGBYTES ((GDB_SS+1)*4) -#endif /* CONFIG_X86_32 */ +#define GDB_ORIG_AX 57 +#define DBG_MAX_REG_NUM 20 +/* 17 64 bit regs and 3 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae7..bae89825e14e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -49,55 +49,94 @@ #include #include -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. - * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, dx) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = 
regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -150,47 +189,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. 
- */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; -- cgit From 9264b278be42c031dc76517a0d4bb154f5dcf470 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Aug 2010 09:22:24 -0500 Subject: KGDB: Remove set but unused newPC Found by gcc 4.6's new warnings Signed-off-by: Andi Kleen Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bae89825e14e..a8b80979ceb8 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -456,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -467,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); -- cgit From df4939350b345ebb44937902827aa75b8ad4998c Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Thu, 5 Aug 2010 09:22:25 -0500 Subject: kgdb,x86: use macro HBP_NUM to replace magic number 4 Use the macros provided by the HW breakpoint API. 
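The intent is easier to see in a reduced sketch (hypothetical names, not the kgdb code): sizing the bookkeeping from HBP_NUM keeps it in sync with the architecture's debug-register count instead of a hard-coded 4.

#include <asm/hw_breakpoint.h>	/* provides HBP_NUM, the number of hardware breakpoint slots */

struct example_slot {
	int enabled;
	unsigned long addr;
};

static struct example_slot example_slots[HBP_NUM];

static int example_find_free_slot(void)
{
	int i;

	/* iterate over exactly as many slots as the hardware provides */
	for (i = 0; i < HBP_NUM; i++)
		if (!example_slots[i].enabled)
			return i;
	return -1;	/* all hardware breakpoint slots are in use */
}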
Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a8b80979ceb8..ef10940e1af0 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -195,7 +195,7 @@ static struct hw_breakpoint { int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; static unsigned long early_dr7; @@ -203,7 +203,7 @@ static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; @@ -290,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -311,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); @@ -331,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -395,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; if (dbg_is_early) { @@ -640,7 +640,7 @@ void kgdb_arch_late(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); -- cgit From 7645e4320497b35ce9fb6c2269ebcd57af9fe735 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 6 Aug 2010 12:18:11 -0700 Subject: x86, kvm: Remove cast obsoleted by set_64bit() prototype cleanup KVM ended up having to put a pretty ugly wrapper around set_64bit() in order to get the type right. Now set_64bit() takes the expected u64 type, and this wrapper can be cleaned up. Signed-off-by: H. Peter Anvin Cc: Avi Kivity LKML-Reference: <4C5C4E7A.8040603@kernel.org> Signed-off-by: Linus Torvalds --- arch/x86/kvm/mmu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0dcc95e09876..311f6dad8951 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -281,11 +281,7 @@ static gfn_t pse36_gfn_delta(u32 gpte) static void __set_spte(u64 *sptep, u64 spte) { -#ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); -#else - set_64bit((unsigned long long *)sptep, spte); -#endif + set_64bit(sptep, spte); } static u64 __xchg_spte(u64 *sptep, u64 new_spte) -- cgit From 7e005f79791dcd58436c88ded4a7f5aed1b82147 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 31 May 2010 15:59:04 +0900 Subject: remove needless ISA_DMA_THRESHOLD Architectures don't need to define ISA_DMA_THRESHOLD anymore. 
Signed-off-by: FUJITA Tomonori Acked-by: James Bottomley Acked-by: David Howells Signed-off-by: Jens Axboe --- arch/x86/include/asm/scatterlist.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index fb0b1874396f..4240878b9d76 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -3,7 +3,6 @@ #include -#define ISA_DMA_THRESHOLD (0x00ffffff) #define ARCH_HAS_SG_CHAIN #endif /* _ASM_X86_SCATTERLIST_H */ -- cgit From 597781f3e51f48ef8e67be772196d9e9673752c4 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Mon, 9 Aug 2010 17:18:32 -0700 Subject: kmap_atomic: make kunmap_atomic() harder to misuse kunmap_atomic() is currently at level -4 on Rusty's "Hard To Misuse" list[1] ("Follow common convention and you'll get it wrong"), except in some architectures when CONFIG_DEBUG_HIGHMEM is set[2][3]. kunmap() takes a pointer to a struct page; kunmap_atomic(), however, takes a pointer to within the page itself. This seems to trip people up once in a while (the convention they are following is the one from kunmap()). Make it much harder to misuse by moving it to level 9 on Rusty's list[4] ("The compiler/linker won't let you get it wrong"). This is done by refusing to build if the type of its first argument is a pointer to a struct page. The real kunmap_atomic() is renamed to kunmap_atomic_notypecheck() (which is what you would call if, for some strange reason, calling it with a pointer to a struct page is not incorrect in your code). The previous version of this patch was compile tested on x86-64. [1] http://ozlabs.org/~rusty/index.cgi/tech/2008-04-01.html [2] In these cases, it is at level 5, "Do it right or it will always break at runtime." [3] At least mips and powerpc look very similar, and sparc also seems to share a common ancestor with both; there seems to be quite some degree of copy-and-paste coding here. The include/asm/highmem.h file for these three archs mentions x86 CPUs at its top. [4] http://ozlabs.org/~rusty/index.cgi/tech/2008-03-30.html [5] As an aside, could someone tell me why mn10300 uses unsigned long as the first parameter of kunmap_atomic() instead of void *? Signed-off-by: Cesar Eduardo Barros Cc: Russell King (arch/arm) Cc: Ralf Baechle (arch/mips) Cc: David Howells (arch/frv, arch/mn10300) Cc: Koichi Yasutake (arch/mn10300) Cc: Kyle McMartin (arch/parisc) Cc: Helge Deller (arch/parisc) Cc: "James E.J. Bottomley" (arch/parisc) Cc: Benjamin Herrenschmidt (arch/powerpc) Cc: Paul Mackerras (arch/powerpc) Cc: "David S. Miller" (arch/sparc) Cc: Thomas Gleixner (arch/x86) Cc: Ingo Molnar (arch/x86) Cc: "H.
Peter Anvin" (arch/x86) Cc: Arnd Bergmann (include/asm-generic) Cc: Rusty Russell ("Hard To Misuse" list) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/highmem.h | 2 +- arch/x86/mm/highmem_32.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a726650fc80f..8caac76ac324 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -61,7 +61,7 @@ void *kmap(struct page *page); void kunmap(struct page *page); void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 63a6ba66cbe0..5e8fa12ef861 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page, enum km_type type) return kmap_atomic_prot(page, type, kmap_prot); } -void kunmap_atomic(void *kvaddr, enum km_type type) +void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -102,7 +102,7 @@ struct page *kmap_atomic_to_page(void *ptr) EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_notypecheck); EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); -- cgit From 4e60c86bd9e5a7110ed28874d0b6592186550ae8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 9 Aug 2010 17:19:03 -0700 Subject: gcc-4.6: mm: fix unused but set warnings No real bugs, just some dead code and some fixups. Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 181be528c612..076052cd62be 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -126,8 +126,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ +#define pte_unmap(pte) ((void)(pte))/* NOP */ +#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) -- cgit From d7a7c573936a86474c4a5090a45a4bc6e680c117 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Aug 2010 17:20:33 -0700 Subject: x86, ia64, smp: use workqueues unconditionally during do_boot_cpu() Workqueues are now initialized as part of the early_initcall(), so they are available for use during the cold boot process as well. Signed-off-by: Suresh Siddha Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51620953b18a..a5e928b0cb5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); -- cgit From 8cbd84f2dd4e52a8771b191030c374ba3e56d291 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Aug 2010 15:35:10 -0700 Subject: x86: fix up system call numbering nit As pointed out by Jiri Slaby: when I resolved the 32-bit x86 system call entry tables for prlimit (due to the conflict with fanotify), I forgot to add the numbering in comments that we do for every fifth entry. Reported-by: Jiri Slaby Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 91dc4bb13032..b86feabed69b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -844,5 +844,5 @@ ia32_sys_call_table: .quad compat_sys_recvmmsg .quad sys_fanotify_init .quad sys32_fanotify_mark - .quad sys_prlimit64 + .quad sys_prlimit64 /* 340 */ ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4802accb9d8d..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -339,4 +339,4 @@ ENTRY(sys_call_table) .long sys_recvmmsg .long sys_fanotify_init .long sys_fanotify_mark - .long sys_prlimit64 + .long sys_prlimit64 /* 340 */ -- cgit From 4565f0170dfc849b3629c27d769db800467baa62 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:22 -0700 Subject: dma-mapping: unify dma_get_cache_alignment implementations dma_get_cache_alignment returns the minimum DMA alignment. Architectures define it as ARCH_DMA_MINALIGN (formerly ARCH_KMALLOC_MINALIGN), so we can unify the dma_get_cache_alignment implementations. Note that some architectures implement dma_get_cache_alignment wrongly: it should return the minimum DMA alignment, so fully coherent architectures should return 1. This patch also fixes this issue.
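Presumably the consolidated generic helper ends up along these lines (a sketch inferred from the description above, not quoted from the cross-architecture part of the patch):

static inline int dma_get_cache_alignment(void)
{
#ifdef ARCH_DMA_MINALIGN
	/* non-coherent architectures: report the minimum safe DMA alignment */
	return ARCH_DMA_MINALIGN;
#endif
	/* fully coherent architectures need no extra alignment */
	return 1;
}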
Signed-off-by: FUJITA Tomonori Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index ac91eed21061..f9c67e83f648 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -87,13 +87,6 @@ dma_cache_sync(struct device *dev, void *vaddr, size_t size, flush_write_buffers(); } -static inline int dma_get_cache_alignment(void) -{ - /* no easy way to get cache size on all x86, so return the - * maximum possible, to be safe */ - return boot_cpu_data.x86_clflush_size; -} - static inline unsigned long dma_alloc_coherent_mask(struct device *dev, gfp_t gfp) { -- cgit From 3b9c6c11f519718d618f5d7c9508daf78b207f6f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Aug 2010 18:03:25 -0700 Subject: dma-mapping: remove dma_is_consistent API Architectures implement dma_is_consistent() in different ways (some misinterpret the definition of the API in DMA-API.txt), so it hasn't been very useful for drivers. We have only one in-tree user of the API, and out-of-tree drivers are unlikely to use it. Even if we fix dma_is_consistent() in some architectures, it doesn't look useful at all. It was invented long ago for some old systems that can't allocate coherent memory at all. It's better to export only APIs that are definitely necessary for drivers. Let's remove this API. Signed-off-by: FUJITA Tomonori Cc: James Bottomley Reviewed-by: Konrad Rzeszutek Wilk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index f9c67e83f648..d4c419f883a0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -54,7 +54,6 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) -#define dma_is_consistent(d, h) (1) extern int dma_supported(struct device *hwdev, u64 mask); extern int dma_set_mask(struct device *dev, u64 mask); -- cgit From a3da323420d5aa6f7bd15efc7bf34cd6d19e1f1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 8 Aug 2010 02:37:23 +0900 Subject: [CPUFREQ] add missing __percpu markup in pcc-cpufreq.c pcc_cpu_info is a percpu pointer but was missing the __percpu markup. Add it.
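For context, __percpu is a sparse annotation: it lets static checking catch per-CPU accessors applied to ordinary pointers and vice versa. A minimal usage sketch with hypothetical names (not the pcc-cpufreq code):

#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/types.h>

struct example_stat {
	u32 input_offset;
	u32 output_offset;
};

/* annotated as a per-CPU pointer, as in the one-line patch below */
static struct example_stat __percpu *example_info;

static int example_setup(unsigned int cpu)
{
	struct example_stat *p;

	example_info = alloc_percpu(struct example_stat);
	if (!example_info)
		return -ENOMEM;

	/* per_cpu_ptr() yields an ordinary pointer to one CPU's copy */
	p = per_cpu_ptr(example_info, cpu);
	p->input_offset = 0;
	return 0;
}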
Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5bbb622..994230d4dc4e 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -110,7 +110,7 @@ struct pcc_cpu { u32 output_offset; }; -static struct pcc_cpu *pcc_cpu_info; +static struct pcc_cpu __percpu *pcc_cpu_info; static int pcc_cpufreq_verify(struct cpufreq_policy *policy) { -- cgit From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 Aug 2010 14:20:10 -0700 Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods The current computation of FSEC_PER_SEC, introduced with f12a15be63, uses the multiplication (FSEC_PER_NSEC * NSEC_PER_SEC), which is performed only with 32-bit integers on small machines, resulting in an overflow and *very* short intervals being programmed. An interrupt storm follows. Note that we also have to specify FSEC_PER_SEC as being long long to overcome the same limitations. Signed-off-by: Chris Wilson Signed-off-by: John Stultz Cc: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 33dbcc4ec5ff..351f9c0fea1f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) * scaled math multiplication factor for nanosecond to hpet tick * conversion. */ - hpet_freq = 1000000000000000ULL; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); evt->mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, evt->shift); @@ -837,7 +837,7 @@ static int hpet_clocksource_register(void) * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period */ - hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); -- cgit
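A closing note on the hpet fix above: the failure mode is plain 32-bit wraparound. A standalone user-space illustration (not kernel code) of why the femtoseconds-per-second product must be computed in 64 bits:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t fsec_per_nsec = 1000000;	/* 10^6 femtoseconds per nanosecond */
	uint32_t nsec_per_sec = 1000000000;	/* 10^9 nanoseconds per second */

	/* 10^15 does not fit in 32 bits, so the product wraps modulo 2^32 */
	uint32_t wrapped = fsec_per_nsec * nsec_per_sec;
	uint64_t correct = (uint64_t)fsec_per_nsec * nsec_per_sec;

	printf("32-bit product: %u\n", wrapped);	/* 2764472320, far too small */
	printf("64-bit product: %llu\n", (unsigned long long)correct);	/* 1000000000000000 */
	return 0;
}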