From ad66dd340f561bdde2285992314d9e4fd9b6191e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 13:11:56 -0700 Subject: x2apic: xen64 paravirt basic apic ops Define the Xen specific basic apic ops, in additon to paravirt apic ops, with some misc warning fixes. Signed-off-by: Suresh Siddha Cc: Jeremy Fitzhardinge Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dcd4e51f2f16..54e255667530 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -548,16 +548,45 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static u32 xen_apic_read(u32 reg) { return 0; } -static void xen_apic_write(unsigned long reg, u32 val) +static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); } + +#ifdef CONFIG_X86_64 +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void xen_apic_wait_icr_idle(void) +{ + return; +} + +static struct apic_ops xen_basic_apic_ops = { + .read = xen_apic_read, + .write = xen_apic_write, + .write_atomic = xen_apic_write, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_apic_wait_icr_idle, +}; +#endif + #endif static void xen_flush_tlb(void) @@ -1130,9 +1159,11 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_X86_64 .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, +#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1291,6 +1322,12 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; +#ifdef CONFIG_X86_64 + /* + * for 64bit, set up the basic apic ops aswell. + */ + apic_ops = &xen_basic_apic_ops; +#endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; -- cgit From 94a8c3c2437c8946f1b6c8e0b2c560a7db8ed3c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 22:19:35 -0700 Subject: x86: let 32bit use apic_ops too - fix fix for pv - clean up the namespace there too. Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 54e255667530..d11dda7ebd7a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -559,7 +559,6 @@ static void xen_apic_write(u32 reg, u32 val) WARN_ON(1); } -#ifdef CONFIG_X86_64 static u64 xen_apic_icr_read(void) { return 0; @@ -576,6 +575,11 @@ static void xen_apic_wait_icr_idle(void) return; } +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + static struct apic_ops xen_basic_apic_ops = { .read = xen_apic_read, .write = xen_apic_write, @@ -583,9 +587,8 @@ static struct apic_ops xen_basic_apic_ops = { .icr_read = xen_apic_icr_read, .icr_write = xen_apic_icr_write, .wait_icr_idle = xen_apic_wait_icr_idle, - .safe_wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, }; -#endif #endif @@ -1159,11 +1162,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC -#ifndef CONFIG_X86_64 - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, -#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1322,9 +1320,10 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_LOCAL_APIC /* - * for 64bit, set up the basic apic ops aswell. + * set up the basic apic ops. */ apic_ops = &xen_basic_apic_ops; #endif -- cgit From caf43bf7c6a55e89b6df5179df434d67e24aa32e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 14:06:50 +0200 Subject: x86, xen: fix apic_ops build on UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/xen/enlighten.c:615: error: variable ‘xen_basic_apic_ops’ has initializer but incomplete type arch/x86/xen/enlighten.c:616: error: unknown field ‘read’ specified in initializer [...] Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 008b7b69581e..e4d1459a63df 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include -- cgit From 38ffbe66d59051fd9cfcfc8545f164700e2fa3bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 14:21:18 -0700 Subject: x86/paravirt/xen: properly fill out the ldt ops LTP testing showed that Xen does not properly implement sys_modify_ldt(). This patch does the final little bits needed to make the ldt work properly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..06219e60e9c8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,6 +325,26 @@ static unsigned long xen_store_tr(void) return 0; } +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readonly(v + i); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readwrite(v + i); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; @@ -1220,6 +1240,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, -- cgit From b56afe1d41653fb07ab1b5af5ccc12001c4dd5a0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 06219e60e9c8..e2767c28dac7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1347,7 +1347,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit From a05d2ebab28011c2f3f520833f4bfdd2fd1b9c02 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:45:02 -0700 Subject: xen: fix allocation and use of large ldts When the ldt gets to more than 1 page in size, the kernel uses vmalloc to allocate it. This means that: - when making the ldt RO, we must update the pages in both the vmalloc mapping and the linear mapping to make sure there are no RW aliases. - we need to use arbitrary_virt_to_machine to compute the machine addr for each update Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 51 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e2767c28dac7..b011e4a5dbbe 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,24 +325,55 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * If 'v' is a vmalloc mapping, then find the linear mapping of the + * page (if any) and also set its protections to match: + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readonly(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readwrite(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -446,7 +477,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -579,7 +610,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); -- cgit From d89961e2dc87b6e30b8e3f60bd2af5cd92cf4643 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 13:48:58 -0700 Subject: xen: suppress known wrmsrs In general, Xen doesn't support wrmsr from an unprivileged domain; it just ends up ignoring the instruction and printing a message on the console. Given that there are sets of MSRs we know the kernel will try to write to, but we don't care, just eat them in xen_write_msr to cut down on console noise. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b011e4a5dbbe..b795470ec069 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,6 +854,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + default: ret = native_write_msr_safe(msr, low, high); } -- cgit From 0d1edf46ba229b46efacf75c0544b88c05a7b266 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 11:53:57 -0700 Subject: xen: compile irq functions without -pg for ftrace For some reason I managed to miss a bunch of irq-related functions which also need to be compiled without -pg when using ftrace. This patch moves them into their own file, and starts a cleanup process I've been meaning to do anyway. Signed-off-by: Jeremy Fitzhardinge Cc: Sam Ravnborg Cc: "Alex Nixon (Intern)" Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 122 +---------------------------------------------- 1 file changed, 2 insertions(+), 120 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b795470ec069..cf8b3a93122b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -226,94 +225,6 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ - - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -1308,36 +1219,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, @@ -1740,10 +1621,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; -- cgit From cef43bf6b3afd819f7cdcba356af0e8220fb3789 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 13:33:44 -0700 Subject: xen: fix allocation and use of large ldts, cleanup Add a proper comment for set_aliased_prot() and fix an unsigned long/void * warning. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8b3a93122b..04ec69e4d02e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -237,8 +237,10 @@ static unsigned long xen_store_tr(void) } /* - * If 'v' is a vmalloc mapping, then find the linear mapping of the - * page (if any) and also set its protections to match: + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. */ static void set_aliased_prot(void *v, pgprot_t prot) { @@ -387,8 +389,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); -- cgit From 169ad16bb87c10a3f7c108bb7008ebc0270f617a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 28 Jul 2008 18:32:09 -0300 Subject: xen_alloc_ptpage: cast PFN_PHYS() argument to unsigned long Currently paravirt_ops alloc_p*() uses u32 for the pfn args. We should change that later, but while the pfn parameter is still u32, we need to cast the PFN_PHYS() argument at xen_alloc_ptpage() to unsigned long, otherwise it will lose bits on the shift. I think PFN_PHYS() should behave better when fed with smaller integers, but a cast to unsigned long won't be enough for all cases on 32-bit PAE, and a cast to u64 would be overkill for most users of PFN_PHYS(). We could have two different flavors of PFN_PHYS: one for low pages only (unsigned long) and another that works for any page (u64)), but while we don't have it, we will need the cast to unsigned long on xen_alloc_ptpage(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 04ec69e4d02e..53afa14eb314 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -822,7 +822,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else -- cgit From 6e833587e11ed0dbf12e647127f2650e2f80b26d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:16:17 -0700 Subject: xen: clean up domain mode predicates There are four operating modes Xen code may find itself running in: - native - hvm domain - pv dom0 - pv domU Clean up predicates for testing for these states to make them more consistent. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 53afa14eb314..b106e825d266 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1613,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1650,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1685,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); -- cgit From f86399396ce7a4f4069828b7dceac5aa5113dfb5 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 30 Jul 2008 18:32:27 -0300 Subject: x86, paravirt_ops: use unsigned long instead of u32 for alloc_p*() pfn args This patch changes the pfn args from 'u32' to 'unsigned long' on alloc_p*() functions on paravirt_ops, and the corresponding implementations for Xen and VMI. The prototypes for CONFIG_PARAVIRT=n are already using unsigned long, so paravirt.h now matches the prototypes on asm-x86/pgalloc.h. It shouldn't result in any changes on generated code on 32-bit, with or without CONFIG_PARAVIRT. On both cases, 'codiff -f' didn't show any change after applying this patch. On 64-bit, there are (expected) binary changes only when CONFIG_PARAVIRT is enabled, as the patch is really supposed to change the size of the pfn args. [ v2: KVM_GUEST: use the right parameter type on kvm_release_pt() ] Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Acked-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..db970bdc5e3d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -812,7 +812,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -822,7 +822,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(u32 pfn) +static void xen_release_pte_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -838,7 +838,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -856,12 +856,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -909,7 +909,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) +static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -923,23 +923,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pte(u32 pfn) +static void xen_release_pte(unsigned long pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pmd(u32 pfn) +static void xen_release_pmd(unsigned long pfn) { xen_release_ptpage(pfn, PT_PMD); } #if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); } -static void xen_release_pud(u32 pfn) +static void xen_release_pud(unsigned long pfn) { xen_release_ptpage(pfn, PT_PUD); } -- cgit From 6a9e91846bf52cc70a0417de19fdfac224c435c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:25 -0700 Subject: xen: fix pinning when not using split pte locks We only pin PTE pages when using split PTE locks, so don't do the pin/unpin when attaching/detaching pte pages to a pinned pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b106e825d266..8ca2f88bde1e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -826,7 +826,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -894,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -- cgit From db053b86f4b1ec790da2dafe2acb93be76288bb9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:41:31 -0700 Subject: xen: clean up x86-64 warnings There are a couple of Xen features which rely on directly accessing per-cpu data via a segment register, which is not yet available on x86-64. In the meantime, just disable direct access to the vcpu info structure; this leaves some of the code as dead, but it will come to life in time, and the warnings are suppressed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 61 ++++++++---------------------------------------- 1 file changed, 10 insertions(+), 51 deletions(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ca2f88bde1e..85692c9f6496 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -112,7 +112,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -941,6 +948,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -959,6 +967,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1025,7 +1034,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1035,7 +1043,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1056,12 +1063,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1399,48 +1404,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1698,15 +1666,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); -- cgit From 5dc64a3442b98eaa0e3730c35fcf00cf962a93e7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 10 Oct 2008 11:27:38 +0100 Subject: xen: do not reserve 2 pages of padding between hypervisor and fixmap. When reserving space for the hypervisor the Xen paravirt backend adds an extra two pages (this was carried forward from the 2.6.18-xen tree which had them "for safety"). Depending on various CONFIG options this can cause the boot time fixmaps to span multiple PMDs which is not supported and triggers a WARN in early_ioremap_init(). This was exposed by 2216d199b1430d1c0affb1498a9ebdbd9c0de439 which moved the dmi table parsing earlier. x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. There is no real reason to reserve these two extra pages and the fixmap already incorporates FIX_HOLE which serves the same purpose. None of the other callers of reserve_top_address do this. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 85692c9f6496..977a54255fb4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1370,7 +1370,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } -- cgit From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:27:03 -0700 Subject: mm: rewrite vmap layer Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and provide a fast, scalable percpu frontend for small vmaps (requires a slightly different API, though). The biggest problem with vmap is actually vunmap. Presently this requires a global kernel TLB flush, which on most architectures is a broadcast IPI to all CPUs to flush the cache. This is all done under a global lock. As the number of CPUs increases, so will the number of vunmaps a scaled workload will want to perform, and so will the cost of a global TLB flush. This gives terrible quadratic scalability characteristics. Another problem is that the entire vmap subsystem works under a single lock. It is a rwlock, but it is actually taken for write in all the fast paths, and the read locking would likely never be run concurrently anyway, so it's just pointless. This is a rewrite of vmap subsystem to solve those problems. The existing vmalloc API is implemented on top of the rewritten subsystem. The TLB flushing problem is solved by using lazy TLB unmapping. vmap addresses do not have to be flushed immediately when they are vunmapped, because the kernel will not reuse them again (would be a use-after-free) until they are reallocated. So the addresses aren't allocated again until a subsequent TLB flush. A single TLB flush then can flush multiple vunmaps from each CPU. XEN and PAT and such do not like deferred TLB flushing because they can't always handle multiple aliasing virtual addresses to a physical address. They now call vm_unmap_aliases() in order to flush any deferred mappings. That call is very expensive (well, actually not a lot more expensive than a single vunmap under the old scheme), however it should be OK if not called too often. The virtual memory extent information is stored in an rbtree rather than a linked list to improve the algorithmic scalability. There is a per-CPU allocator for small vmaps, which amortizes or avoids global locking. To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces must be used in place of vmap and vunmap. Vmalloc does not use these interfaces at the moment, so it will not be quite so scalable (although it will use lazy TLB flushing). As a quick test of performance, I ran a test that loops in the kernel, linearly mapping then touching then unmapping 4 pages. Different numbers of tests were run in parallel on an 4 core, 2 socket opteron. Results are in nanoseconds per map+touch+unmap. threads vanilla vmap rewrite 1 14700 2900 2 33600 3000 4 49500 2800 8 70631 2900 So with a 8 cores, the rewritten version is already 25x faster. In a slightly more realistic test (although with an older and less scalable version of the patch), I ripped the not-very-good vunmap batching code out of XFS, and implemented the large buffer mapping with vm_map_ram and vm_unmap_ram... along with a couple of other tricks, I was able to speed up a large directory workload by 20x on a 64 CPU system. I believe vmap/vunmap is actually sped up a lot more than 20x on such a system, but I'm running into other locks now. vmap is pretty well blown off the profiles. Before: 1352059 total 0.1401 798784 _write_lock 8320.6667 <- vmlist_lock 529313 default_idle 1181.5022 15242 smp_call_function 15.8771 <- vmap tlb flushing 2472 __get_vm_area_node 1.9312 <- vmap 1762 remove_vm_area 4.5885 <- vunmap 316 map_vm_area 0.2297 <- vmap 312 kfree 0.1950 300 _spin_lock 3.1250 252 sn_send_IPI_phys 0.4375 <- tlb flushing 238 vmap 0.8264 <- vmap 216 find_lock_page 0.5192 196 find_next_bit 0.3603 136 sn2_send_IPI 0.2024 130 pio_phys_write_mmr 2.0312 118 unmap_kernel_range 0.1229 After: 78406 total 0.0081 40053 default_idle 89.4040 33576 ia64_spinlock_contention 349.7500 1650 _spin_lock 17.1875 319 __reg_op 0.5538 281 _atomic_dec_and_lock 1.0977 153 mutex_unlock 1.5938 123 iget_locked 0.1671 117 xfs_dir_lookup 0.1662 117 dput 0.1406 114 xfs_iget_core 0.0268 92 xfs_da_hashname 0.1917 75 d_alloc 0.0670 68 vmap_page_range 0.0462 <- vmap 58 kmem_cache_alloc 0.0604 57 memset 0.0540 52 rb_next 0.1625 50 __copy_user 0.0208 49 bitmap_find_free_region 0.2188 <- vmap 46 ia64_sn_udelay 0.1106 45 find_inode_fast 0.1406 42 memcmp 0.2188 42 finish_task_switch 0.1094 42 __d_lookup 0.0410 40 radix_tree_lookup_slot 0.1250 37 _spin_unlock_irqrestore 0.3854 36 xfs_bmapi 0.0050 36 kmem_cache_free 0.0256 35 xfs_vn_getattr 0.0322 34 radix_tree_lookup 0.1062 33 __link_path_walk 0.0035 31 xfs_da_do_buf 0.0091 30 _xfs_buf_find 0.0204 28 find_get_page 0.0875 27 xfs_iread 0.0241 27 __strncpy_from_user 0.2812 26 _xfs_buf_initialize 0.0406 24 _xfs_buf_lookup_pages 0.0179 24 vunmap_page_range 0.0250 <- vunmap 23 find_lock_page 0.0799 22 vm_map_ram 0.0087 <- vmap 20 kfree 0.0125 19 put_page 0.0330 18 __kmalloc 0.0176 17 xfs_da_node_lookup_int 0.0086 17 _read_lock 0.0885 17 page_waitqueue 0.0664 vmap has gone from being the top 5 on the profiles and flushing the crap out of all TLBs, to using less than 1% of kernel time. [akpm@linux-foundation.org: cleanups, section fix] [akpm@linux-foundation.org: fix build on alpha] Signed-off-by: Nick Piggin Cc: Jeremy Fitzhardinge Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen/enlighten.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0013a729b41d..b61534c7a4c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l /* make sure there are no stray mappings of this page */ kmap_flush_unused(); + vm_unmap_aliases(); } } -- cgit