diff options
Diffstat (limited to 'arch/x86/kvm')
44 files changed, 1424 insertions, 877 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c42613cfb5ba..fe03bd978761 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_after_set_cpuid(vcpu); } -static int is_efer_nx(void) -{ - return host_efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { - cpuid_entry_clear(entry, X86_FEATURE_NX); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) void kvm_set_cpu_caps(void) { - unsigned int f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); @@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void) F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) ); @@ -765,7 +739,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; - edx.split.anythread_deprecated = 1; + if (cap.version) + edx.split.anythread_deprecated = 1; edx.split.reserved1 = 0; edx.split.reserved2 = 0; @@ -940,8 +915,21 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); unsigned phys_as = entry->eax & 0xff; - if (!g_phys_as) + /* + * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as + * the guest operates in the same PA space as the host, i.e. + * reductions in MAXPHYADDR for memory encryption affect shadow + * paging, too. + * + * If TDP is enabled but an explicit guest MAXPHYADDR is not + * provided, use the raw bare metal MAXPHYADDR as reductions to + * the HPAs do not affect GPAs. + */ + if (!tdp_enabled) + g_phys_as = boot_cpu_data.x86_phys_bits; + else if (!g_phys_as) g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); @@ -964,12 +952,18 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) case 0x8000001a: case 0x8000001e: break; - /* Support memory encryption cpuid if host supports it */ case 0x8000001F: - if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) + if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; - else + } else { cpuid_entry_override(entry, CPUID_8000_001F_EAX); + + /* + * Enumerate '0' for "PA bits reduction", the adjusted + * MAXPHYADDR is enumerated directly (see 0x80000008). + */ + entry->ebx &= ~GENMASK(11, 6); + } break; /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95a98413dc32..54a83a744538 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -7,6 +7,8 @@ #include <linux/kvm_host.h> #include <linux/debugfs.h> #include "lapic.h" +#include "mmu.h" +#include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { @@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ &vcpu_tsc_scaling_frac_fops); } } + +/* + * This covers statistics <1024 (11=log(1024)+1), which should be enough to + * cover RMAP_RECYCLE_THRESHOLD. + */ +#define RMAP_LOG_SIZE 11 + +static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; + +static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) +{ + struct kvm_rmap_head *rmap; + struct kvm *kvm = m->private; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned int lpage_size, index; + /* Still small enough to be on the stack */ + unsigned int *log[KVM_NR_PAGE_SIZES], *cur; + int i, j, k, l, ret; + + ret = -ENOMEM; + memset(log, 0, sizeof(log)); + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); + if (!log[i]) + goto out; + } + + mutex_lock(&kvm->slots_lock); + write_lock(&kvm->mmu_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + for (j = 0; j < slots->used_slots; j++) { + slot = &slots->memslots[j]; + for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { + rmap = slot->arch.rmap[k]; + lpage_size = kvm_mmu_slot_lpages(slot, k + 1); + cur = log[k]; + for (l = 0; l < lpage_size; l++) { + index = ffs(pte_list_count(&rmap[l])); + if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) + index = RMAP_LOG_SIZE - 1; + cur[index]++; + } + } + } + } + + write_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + + /* index=0 counts no rmap; index=1 counts 1 rmap */ + seq_printf(m, "Rmap_Count:\t0\t1\t"); + for (i = 2; i < RMAP_LOG_SIZE; i++) { + j = 1 << (i - 1); + k = (1 << i) - 1; + seq_printf(m, "%d-%d\t", j, k); + } + seq_printf(m, "\n"); + + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); + cur = log[i]; + for (j = 0; j < RMAP_LOG_SIZE; j++) + seq_printf(m, "%d\t", cur[j]); + seq_printf(m, "\n"); + } + + ret = 0; +out: + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) + kfree(log[i]); + + return ret; +} + +static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + return single_open(file, kvm_mmu_rmaps_stat_show, kvm); +} + +static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + kvm_put_kvm(kvm); + + return single_release(inode, file); +} + +static const struct file_operations mmu_rmaps_stat_fops = { + .open = kvm_mmu_rmaps_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_mmu_rmaps_stat_release, +}; + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, + &mmu_rmaps_stat_fops); + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b07592ca92f0..232a86a6faaf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. - */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@ -1933,7 +1951,7 @@ ret_success: void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { @@ -2016,6 +2034,7 @@ static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) static int kvm_hv_hypercall_complete(struct kvm_vcpu *vcpu, u64 result) { + trace_kvm_hv_hypercall_done(result); kvm_hv_hypercall_set_result(vcpu, result); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); @@ -2139,6 +2158,7 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code) int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct kvm_hv_hcall hc; u64 ret = HV_STATUS_SUCCESS; @@ -2173,17 +2193,25 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) hc.rep_idx = (hc.param >> HV_HYPERCALL_REP_START_OFFSET) & 0xfff; hc.rep = !!(hc.rep_cnt || hc.rep_idx); - if (hc.fast && is_xmm_fast_hypercall(&hc)) - kvm_hv_hypercall_read_xmm(&hc); - trace_kvm_hv_hypercall(hc.code, hc.fast, hc.rep_cnt, hc.rep_idx, hc.ingpa, hc.outgpa); - if (unlikely(!hv_check_hypercall_access(to_hv_vcpu(vcpu), hc.code))) { + if (unlikely(!hv_check_hypercall_access(hv_vcpu, hc.code))) { ret = HV_STATUS_ACCESS_DENIED; goto hypercall_complete; } + if (hc.fast && is_xmm_fast_hypercall(&hc)) { + if (unlikely(hv_vcpu->enforce_cpuid && + !(hv_vcpu->cpuid_cache.features_edx & + HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE))) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + kvm_hv_hypercall_read_xmm(&hc); + } + switch (hc.code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: if (unlikely(hc.rep)) { @@ -2466,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 629a09ca9860..0b80263d46d8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -541,17 +541,17 @@ static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, addr, len, val); } -static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, const void *val) { - return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_write(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } -static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, +static int picdev_elcr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int len, void *val) { - return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + return picdev_read(container_of(dev, struct kvm_pic, dev_elcr), addr, len, val); } @@ -577,9 +577,9 @@ static const struct kvm_io_device_ops picdev_slave_ops = { .write = picdev_slave_write, }; -static const struct kvm_io_device_ops picdev_eclr_ops = { - .read = picdev_eclr_read, - .write = picdev_eclr_write, +static const struct kvm_io_device_ops picdev_elcr_ops = { + .read = picdev_elcr_read, + .write = picdev_elcr_write, }; int kvm_pic_init(struct kvm *kvm) @@ -602,7 +602,7 @@ int kvm_pic_init(struct kvm *kvm) */ kvm_iodevice_init(&s->dev_master, &picdev_master_ops); kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); - kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); + kvm_iodevice_init(&s->dev_elcr, &picdev_elcr_ops); mutex_lock(&kvm->slots_lock); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, &s->dev_master); @@ -613,7 +613,7 @@ int kvm_pic_init(struct kvm *kvm) if (ret < 0) goto fail_unreg_2; - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_elcr); if (ret < 0) goto fail_unreg_1; @@ -647,7 +647,7 @@ void kvm_pic_destroy(struct kvm *kvm) mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); - kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_elcr); mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 698969e18fe3..ff005fe738a4 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 660401700075..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,21 +35,17 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID); + DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID]; + u8 vectors[KVM_MAX_VCPU_ID + 1]; }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9b64abf9b3f1..650642b18d15 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -55,7 +55,7 @@ struct kvm_pic { int output; /* intr from master PIC */ struct kvm_io_device dev_master; struct kvm_io_device dev_slave; - struct kvm_io_device dev_eclr; + struct kvm_io_device dev_elcr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map @@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { @@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 83e6c6965f1e..e9688a9f7b57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + +static inline unsigned long +__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, + int level) +{ + return gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; +} + +static inline unsigned long +kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) +{ + return __kvm_mmu_slot_lpages(slot, slot->npages, level); +} + +static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) +{ + atomic64_add(count, &kvm->stat.pages[level - 1]); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 845d114ae075..2d7e61122af8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -53,6 +53,8 @@ #include <asm/kvm_page_track.h> #include "trace.h" +#include "paging.h" + extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; @@ -95,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -135,12 +138,22 @@ module_param(dbg, bool, 0644); #include <trace/events/kvm.h> -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -191,7 +204,7 @@ struct kvm_mmu_role_regs { * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ { \ return !!(regs->reg & flag); \ } @@ -213,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ } @@ -321,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { - /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { - exception->error_code |= PFERR_RSVD_MASK; - return UNMAPPED_GVA; - } - return gpa; } @@ -590,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; + int level = sptep_to_sp(sptep)->role.level; if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); @@ -603,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; + + kvm_update_page_stats(kvm, level, -1); pfn = spte_to_pfn(old_spte); @@ -620,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -684,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -784,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -797,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -891,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -901,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -928,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -967,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); @@ -982,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + u64 *sptep) { - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { - unsigned long idx; + struct pte_list_desc *desc, *next; + int i; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(kvm, desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) { - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; + struct pte_list_desc *desc; + unsigned int count = 0; - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); + if (!rmap_head->val) + return 0; + else if (!(rmap_head->val & 1)) + return 1; + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + count += desc->spte_count; + desc = desc->more; + } + + return count; +} + +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) +{ + unsigned long idx; + + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1018,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1117,7 +1187,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } @@ -1127,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) if (is_large_pte(*sptep)) { WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - --kvm->stat.lpages; return true; } @@ -1216,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1254,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1287,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1354,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1375,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(kvm, rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1419,13 +1479,13 @@ restart: need_flush = 1; if (pte_write(pte)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( *sptep, new_pfn); - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); mmu_spte_set(sptep, new_spte); } } @@ -1440,7 +1500,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. */ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1460,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -1582,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, @@ -1642,7 +1702,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); @@ -2230,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); @@ -2533,6 +2591,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page @@ -2555,9 +2614,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -2688,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - if (is_shadow_present_pte(*sptep)) { - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } + if (!was_rmapped) { + kvm_update_page_stats(vcpu->kvm, level, 1); + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); } return ret; @@ -2824,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; + int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -2835,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - return host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + return min(host_level, max_level); } int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2859,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K) - return level; - - *req_level = level = min(level, max_level); - /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - if (huge_page_disallowed) + *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); + if (level == PG_LEVEL_4K || huge_page_disallowed) return PG_LEVEL_4K; /* @@ -2937,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); - - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + if (is_shadow_present_pte(*it.sptep)) + continue; + + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, @@ -3094,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3113,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3178,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3192,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3427,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (WARN_ON_ONCE(!mmu->pml5_root)) { + r = -EIO; + goto out_unlock; + } + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + } } for (i = 0; i < 4; ++i) { @@ -3454,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3470,7 +3582,10 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *pml4_root, *pae_root; + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3483,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) return 0; /* - * This mess only works with 4-level paging and needs to be updated to - * work with 5-level paging. + * NPT, the only paging mode that uses this horror, uses a fixed number + * of levels for the shadow page tables, e.g. all MMUs are 4-level or + * all MMus are 5-level. Thus, this can safely require that pml5_root + * is allocated if the other roots are valid and pml5 is needed, as any + * prior MMU would also have required pml5. */ - if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) - return -EIO; - - if (mmu->pae_root && mmu->pml4_root) + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + (need_pml5 && mmu->pml5_root))) return -EIO; /* @@ -3507,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; +#ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; + if (!pml4_root) + goto err_pml4; + + if (need_pml5) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } +#endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3612,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3619,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3634,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3647,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3767,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + bool write, bool *writable, int *r) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3780,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return true; - - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; - return false; + goto out_retry; + + if (!kvm_is_visible_memslot(slot)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu)) { + *pfn = KVM_PFN_NOSLOT; + *writable = false; + return false; + } + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && + !kvm_apicv_activated(vcpu->kvm)) { + *r = RET_PF_EMULATE; + return true; + } } async = false; @@ -3800,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; + goto out_retry; } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; + goto out_retry; } *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable, hva); - return false; + +out_retry: + *r = RET_PF_RETRY; + return true; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -3826,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3839,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; @@ -4560,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5132,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5251,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) */ } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* @@ -5274,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5306,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5317,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5330,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); + free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) @@ -5535,6 +5689,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + if (!kvm_mmu_init_tdp_mmu(kvm)) /* * No smp_load/store wrappers needed here as we are in @@ -5557,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +/* + * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end + * (not including it) + */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; @@ -5564,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; + write_lock(&kvm->mmu_lock); + + kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + if (kvm_memslots_have_rmaps(kvm)) { - write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5576,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - write_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - - read_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush, true); + gfn_end, flush); if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); - - read_unlock(&kvm->mmu_lock); + gfn_end - gfn_start); } + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5646,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5669,7 +5835,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -5685,10 +5851,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5724,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..bf2bdbf333c2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -31,13 +31,16 @@ extern bool dbg; #define IS_VALID_PAE_ROOT(x) (!!(x)) struct kvm_mmu_page { + /* + * Note, "link" through "spt" fit in a single 64 byte cache line on + * 64-bit kernels, keep it that way unless there's a reason not to. + */ struct list_head link; struct hlist_node hash_link; - struct list_head lpage_disallowed_link; + bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* @@ -59,6 +62,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); + struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -71,8 +75,6 @@ struct kvm_mmu_page { atomic_t write_flooding_count; #ifdef CONFIG_X86_64 - bool tdp_mmu_page; - /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif @@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). @@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 91a9f7e0fd91..269f11f92fd0 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,6 +16,7 @@ #include <asm/kvm_page_track.h> +#include "mmu.h" #include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/paging.h b/arch/x86/kvm/mmu/paging.h new file mode 100644 index 000000000000..de8ab323bb70 --- /dev/null +++ b/arch/x86/kvm/mmu/paging.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Shadow paging constants/helpers that don't need to be #undef'd. */ +#ifndef __KVM_X86_PAGING_H +#define __KVM_X86_PAGING_H + +#define GUEST_PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) +#define PT64_LVL_ADDR_MASK(level) \ + (GUEST_PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#define PT64_LVL_OFFSET_MASK(level) \ + (GUEST_PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ + * PT64_LEVEL_BITS))) - 1)) +#endif /* __KVM_X86_PAGING_H */ + diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 490a028ddabe..7d03e9b7ccfa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -24,7 +24,7 @@ #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -57,7 +57,7 @@ #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name - #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK + #define PT_BASE_ADDR_MASK GUEST_PT64_BASE_ADDR_MASK #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 7a5ce9314107..eb7b227fc6cf 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -38,12 +38,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); #else #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #endif -#define PT64_LVL_ADDR_MASK(level) \ - (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) -#define PT64_LVL_OFFSET_MASK(level) \ - (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ - * PT64_LEVEL_BITS))) - 1)) #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ | shadow_x_mask | shadow_nx_mask | shadow_me_mask) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..64ccfc1fa553 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include <asm/cmpxchg.h> #include <trace/events/kvm.h> -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ @@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - kvm_lockdep_assert_mmu_lock_held(kvm, shared); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) @@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); + zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -256,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. */ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared, bool account_nx) + bool account_nx) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -446,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); - if (is_large_pte(old_spte) != is_large_pte(new_spte)) { - if (is_large_pte(old_spte)) - atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); - } - /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -478,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) @@ -527,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -538,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -559,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -724,13 +738,29 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush, bool shared) { + gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * No need to try to step down in the iterator when zapping all SPTEs, + * zapping the top-level non-leaf SPTEs will recurse on their children. + */ + int min_level = zap_all ? root->role.level : PG_LEVEL_4K; + + /* + * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will + * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, + * and so KVM will never install a SPTE for such addresses. + */ + end = min(end, max_gfn_host); + kvm_lockdep_assert_mmu_lock_held(kvm, shared); rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { @@ -744,9 +774,10 @@ retry: /* * If this is a non-last-level SPTE that covers a larger range * than should be zapped, continue, and zap the mappings at a - * lower level. + * lower level, except when zapping all SPTEs. */ - if ((iter.gfn < start || + if (!zap_all && + (iter.gfn < start || iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && !is_last_spte(iter.old_spte, iter.level)) continue; @@ -773,34 +804,26 @@ retry: * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU in write mode. */ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - shared); + false); return flush; } void kvm_tdp_mmu_zap_all(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); bool flush = false; int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, - flush, false); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); if (flush) kvm_flush_remote_tlbs(kvm); @@ -838,7 +861,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); struct kvm_mmu_page *next_root; struct kvm_mmu_page *root; bool flush = false; @@ -854,8 +876,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) rcu_read_unlock(); - flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, - true); + flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); /* * Put the reference acquired in @@ -927,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1031,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, true, + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, huge_page_disallowed && req_level >= iter.level); @@ -1242,8 +1262,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1313,7 +1333,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1516,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1527,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..358f447d4012 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared); + gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush, - bool shared) + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, - shared); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false, false); + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); @@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, @@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 1d01da64c333..8052d92069e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else @@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_update_access_page(struct kvm *kvm, bool activate) +static int avic_alloc_access_page(struct kvm *kvm) { void __user *ret; int r = 0; mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_memslot_enabled == activate) || - (kvm->mm != current->mm)) + + if (kvm->arch.apic_access_memslot_enabled) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - activate ? PAGE_SIZE : 0); + PAGE_SIZE); if (IS_ERR(ret)) { r = PTR_ERR(ret); goto out; } - kvm->arch.apic_access_memslot_enabled = activate; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_update_access_page(vcpu->kvm, true); + ret = avic_alloc_access_page(vcpu->kvm); if (ret) return ret; } @@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!enable_apicv || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; @@ -646,7 +632,7 @@ out: void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = svm->vmcb01.ptr; bool activated = kvm_vcpu_apicv_active(vcpu); if (!enable_apicv) @@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } vmcb_mark_dirty(vmcb, VMCB_AVIC); + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + svm_set_pi_irte_mode(vcpu, activated); } @@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) @@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); @@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) struct vcpu_svm *svm = to_svm(vcpu); svm->avic_is_running = is_run; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + if (is_run) avic_vcpu_load(vcpu, vcpu->cpu); else diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 21d03e3a5dfd..2545d0c61985 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -154,6 +154,13 @@ void recalc_intercepts(struct vcpu_svm *svm) for (i = 0; i < MAX_INTERCEPT; i++) c->intercepts[i] |= g->intercepts[i]; + + /* If SMI is not intercepted, ignore guest SMI intercept as well */ + if (!intercept_smi) + vmcb_clr_intercept(c, INTERCEPT_SMI); + + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); } static void copy_vmcb_control_area(struct vmcb_control_area *dst, @@ -304,8 +311,8 @@ static bool nested_vmcb_valid_sregs(struct kvm_vcpu *vcpu, return true; } -static void nested_load_control_from_vmcb12(struct vcpu_svm *svm, - struct vmcb_control_area *control) +void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control) { copy_vmcb_control_area(&svm->nested.ctl, control); @@ -499,7 +506,11 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + const u32 int_ctl_vmcb01_bits = + V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + + const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* @@ -511,7 +522,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id, * avic_physical_id. */ - WARN_ON(svm->vmcb01.ptr->control.int_ctl & AVIC_ENABLE_MASK); + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); /* Copied from vmcb01. msrpm_base can be overwritten later. */ svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl; @@ -531,8 +542,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = - (svm->nested.ctl.int_ctl & ~mask) | - (svm->vmcb01.ptr->control.int_ctl & mask); + (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | + (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; @@ -618,6 +629,11 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) struct kvm_host_map map; u64 vmcb12_gpa; + if (!svm->nested.hsave_msr) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if (is_smm(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -650,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. @@ -692,7 +703,28 @@ out: return ret; } -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +/* Copy state save area fields which are handled by VMRUN */ +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save) +{ + to_save->es = from_save->es; + to_save->cs = from_save->cs; + to_save->ss = from_save->ss; + to_save->ds = from_save->ds; + to_save->gdtr = from_save->gdtr; + to_save->idtr = from_save->idtr; + to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED; + to_save->efer = from_save->efer; + to_save->cr0 = from_save->cr0; + to_save->cr3 = from_save->cr3; + to_save->cr4 = from_save->cr4; + to_save->rax = from_save->rax; + to_save->rsp = from_save->rsp; + to_save->rip = from_save->rip; + to_save->cpl = 0; +} + +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) { to_vmcb->save.fs = from_vmcb->save.fs; to_vmcb->save.gs = from_vmcb->save.gs; @@ -1355,28 +1387,11 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; - svm->vmcb01.ptr->save.es = save->es; - svm->vmcb01.ptr->save.cs = save->cs; - svm->vmcb01.ptr->save.ss = save->ss; - svm->vmcb01.ptr->save.ds = save->ds; - svm->vmcb01.ptr->save.gdtr = save->gdtr; - svm->vmcb01.ptr->save.idtr = save->idtr; - svm->vmcb01.ptr->save.rflags = save->rflags | X86_EFLAGS_FIXED; - svm->vmcb01.ptr->save.efer = save->efer; - svm->vmcb01.ptr->save.cr0 = save->cr0; - svm->vmcb01.ptr->save.cr3 = save->cr3; - svm->vmcb01.ptr->save.cr4 = save->cr4; - svm->vmcb01.ptr->save.rax = save->rax; - svm->vmcb01.ptr->save.rsp = save->rsp; - svm->vmcb01.ptr->save.rip = save->rip; - svm->vmcb01.ptr->save.cpl = 0; - + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save); nested_load_control_from_vmcb12(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm); - kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 62926f1a5f7b..75e0b21ad07c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -28,8 +28,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - #ifndef CONFIG_KVM_AMD_SEV /* * When this config is not defined, SEV feature is not supported and APIs in @@ -64,6 +62,7 @@ static DEFINE_MUTEX(sev_bitmap_lock); unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long sev_me_mask; +static unsigned int nr_asids; static unsigned long *sev_asid_bitmap; static unsigned long *sev_reclaim_asid_bitmap; @@ -78,11 +77,11 @@ struct enc_region { /* Called with the sev_bitmap_lock held, or on shutdown */ static int sev_flush_asids(int min_asid, int max_asid) { - int ret, pos, error = 0; + int ret, asid, error = 0; /* Check if there are any ASIDs to reclaim before performing a flush */ - pos = find_next_bit(sev_reclaim_asid_bitmap, max_asid, min_asid); - if (pos >= max_asid) + asid = find_next_bit(sev_reclaim_asid_bitmap, nr_asids, min_asid); + if (asid > max_asid) return -EBUSY; /* @@ -115,15 +114,15 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) /* The flush process will flush all reclaimable SEV and SEV-ES ASIDs */ bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, - max_sev_asid); - bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + nr_asids); + bitmap_zero(sev_reclaim_asid_bitmap, nr_asids); return true; } static int sev_asid_new(struct kvm_sev_info *sev) { - int pos, min_asid, max_asid, ret; + int asid, min_asid, max_asid, ret; bool retry = true; enum misc_res_type type; @@ -143,11 +142,11 @@ static int sev_asid_new(struct kvm_sev_info *sev) * SEV-enabled guests must use asid from min_sev_asid to max_sev_asid. * SEV-ES-enabled guest can use from 1 to min_sev_asid - 1. */ - min_asid = sev->es_active ? 0 : min_sev_asid - 1; + min_asid = sev->es_active ? 1 : min_sev_asid; max_asid = sev->es_active ? min_sev_asid - 1 : max_sev_asid; again: - pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_asid); - if (pos >= max_asid) { + asid = find_next_zero_bit(sev_asid_bitmap, max_asid + 1, min_asid); + if (asid > max_asid) { if (retry && __sev_recycle_asids(min_asid, max_asid)) { retry = false; goto again; @@ -157,11 +156,11 @@ again: goto e_uncharge; } - __set_bit(pos, sev_asid_bitmap); + __set_bit(asid, sev_asid_bitmap); mutex_unlock(&sev_bitmap_lock); - return pos + 1; + return asid; e_uncharge: misc_cg_uncharge(type, sev->misc_cg, 1); put_misc_cg(sev->misc_cg); @@ -179,17 +178,16 @@ static int sev_get_asid(struct kvm *kvm) static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; - int cpu, pos; + int cpu; enum misc_res_type type; mutex_lock(&sev_bitmap_lock); - pos = sev->asid - 1; - __set_bit(pos, sev_reclaim_asid_bitmap); + __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); - sd->sev_vmcbs[pos] = NULL; + sd->sev_vmcbs[sev->asid] = NULL; } mutex_unlock(&sev_bitmap_lock); @@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not @@ -1272,8 +1271,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Pin guest memory */ guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!guest_page) - return -EFAULT; + if (IS_ERR(guest_page)) + return PTR_ERR(guest_page); /* allocate memory for header and transport buffer */ ret = -ENOMEM; @@ -1310,8 +1309,9 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) } /* Copy packet header to userspace. */ - ret = copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, - params.hdr_len); + if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr, + params.hdr_len)) + ret = -EFAULT; e_free_trans_data: kfree(trans_data); @@ -1463,11 +1463,12 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) data.trans_len = params.trans_len; /* Pin guest memory */ - ret = -EFAULT; guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!guest_page) + if (IS_ERR(guest_page)) { + ret = PTR_ERR(guest_page); goto e_free_trans; + } /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; @@ -1855,12 +1856,17 @@ void __init sev_hardware_setup(void) min_sev_asid = edx; sev_me_mask = 1UL << (ebx & 0x3f); - /* Initialize SEV ASID bitmaps */ - sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + /* + * Initialize SEV ASID bitmaps. Allocate space for ASID 0 in the bitmap, + * even though it's never used, so that the bitmap is indexed by the + * actual ASID. + */ + nr_asids = max_sev_asid + 1; + sev_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_asid_bitmap) goto out; - sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + sev_reclaim_asid_bitmap = bitmap_zalloc(nr_asids, GFP_KERNEL); if (!sev_reclaim_asid_bitmap) { bitmap_free(sev_asid_bitmap); sev_asid_bitmap = NULL; @@ -1905,7 +1911,7 @@ void sev_hardware_teardown(void) return; /* No need to take sev_bitmap_lock, all VMs have been destroyed. */ - sev_flush_asids(0, max_sev_asid); + sev_flush_asids(1, max_sev_asid); bitmap_free(sev_asid_bitmap); bitmap_free(sev_reclaim_asid_bitmap); @@ -1919,7 +1925,7 @@ int sev_cpu_init(struct svm_cpu_data *sd) if (!sev_enabled) return 0; - sd->sev_vmcbs = kcalloc(max_sev_asid + 1, sizeof(void *), GFP_KERNEL); + sd->sev_vmcbs = kcalloc(nr_asids, sizeof(void *), GFP_KERNEL); if (!sd->sev_vmcbs) return -ENOMEM; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8834822c00cd..05e8d4d27969 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -46,8 +46,6 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -198,6 +196,11 @@ module_param(avic, bool, 0444); bool __read_mostly dump_invalid_vmcb; module_param(dump_invalid_vmcb, bool, 0644); + +bool intercept_smi = true; +module_param(intercept_smi, bool, 0444); + + static bool svm_gp_erratum_intercept = true; static u8 rsm_ins_bytes[] = "\x0f\xaa"; @@ -256,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr) static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -457,11 +460,6 @@ static int has_svm(void) return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } @@ -1010,7 +1008,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@ -1156,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1185,7 +1183,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_set_intercept(svm, INTERCEPT_INTR); svm_set_intercept(svm, INTERCEPT_NMI); - svm_set_intercept(svm, INTERCEPT_SMI); + + if (intercept_smi) + svm_set_intercept(svm, INTERCEPT_SMI); + svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); svm_set_intercept(svm, INTERCEPT_RDPMC); svm_set_intercept(svm, INTERCEPT_CPUID); @@ -1233,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1265,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@ -1322,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1398,8 +1368,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) goto error_free_vmsa_page; } - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); @@ -1411,6 +1379,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_switch_vmcb(svm, &svm->vmcb01); init_vmcb(vcpu); + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + svm_init_osvw(vcpu); vcpu->arch.microcode_version = 0x01000065; @@ -1505,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; @@ -1552,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@ -1560,8 +1533,11 @@ static void svm_set_vintr(struct vcpu_svm *svm) { struct vmcb_control_area *control; - /* The following fields are ignored when AVIC is enabled */ - WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + /* + * The following fields are ignored when AVIC is enabled + */ + WARN_ON(kvm_apicv_activated(svm->vcpu.kvm)); + svm_set_intercept(svm, INTERCEPT_VINTR); /* @@ -1578,17 +1554,18 @@ static void svm_set_vintr(struct vcpu_svm *svm) static void svm_clear_vintr(struct vcpu_svm *svm) { - const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ - svm->vmcb->control.int_ctl &= mask; + svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; if (is_guest_mode(&svm->vcpu)) { - svm->vmcb01.ptr->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); - svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & + V_IRQ_INJECTION_BITS_MASK; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); @@ -1923,7 +1900,7 @@ static int npf_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); + u64 fault_address = svm->vmcb->control.exit_info_2; u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); @@ -2066,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2106,6 +2087,11 @@ static int nmi_interception(struct kvm_vcpu *vcpu) return 1; } +static int smi_interception(struct kvm_vcpu *vcpu) +{ + return 1; +} + static int intr_interception(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; @@ -2134,11 +2120,12 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) ret = kvm_skip_emulated_instruction(vcpu); if (vmload) { - nested_svm_vmloadsave(vmcb12, svm->vmcb); + svm_copy_vmloadsave_state(svm->vmcb, vmcb12); svm->sysenter_eip_hi = 0; svm->sysenter_esp_hi = 0; - } else - nested_svm_vmloadsave(svm->vmcb, vmcb12); + } else { + svm_copy_vmloadsave_state(vmcb12, svm->vmcb); + } kvm_vcpu_unmap(vcpu, &map, true); @@ -2941,7 +2928,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm_disable_lbrv(vcpu); break; case MSR_VM_HSAVE_PA: - svm->nested.hsave_msr = data; + /* + * Old kernels did not validate the value written to + * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid + * value to allow live migrating buggy or malicious guests + * originating from those kernels. + */ + if (!msr->host_initiated && !page_address_valid(vcpu, data)) + return 1; + + svm->nested.hsave_msr = data & PAGE_MASK; break; case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); @@ -2966,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@ -2994,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3080,8 +3072,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, - [SVM_EXIT_SMI] = kvm_emulate_as_nop, - [SVM_EXIT_INIT] = kvm_emulate_as_nop, + [SVM_EXIT_SMI] = smi_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, [SVM_EXIT_CPUID] = kvm_emulate_cpuid, @@ -3243,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } -static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); +} +static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) +{ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3256,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@ -3547,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3782,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4288,6 +4282,7 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_host_map map_save; int ret; if (is_guest_mode(vcpu)) { @@ -4303,6 +4298,29 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) ret = nested_svm_vmexit(svm); if (ret) return ret; + + /* + * KVM uses VMCB01 to store L1 host state while L2 runs but + * VMCB01 is going to be used during SMM and thus the state will + * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save + * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the + * format of the area is identical to guest save area offsetted + * by 0x400 (matches the offset of 'struct vmcb_save_area' + * within 'struct vmcb'). Note: HSAVE area may also be used by + * L1 hypervisor to save additional host context (e.g. KVM does + * that, see svm_prepare_guest_switch()) which must be + * preserved. + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; + + BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); + + svm_copy_vmrun_state(map_save.hva + 0x400, + &svm->vmcb01.ptr->save); + + kvm_vcpu_unmap(vcpu, &map_save, true); } return 0; } @@ -4310,13 +4328,14 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); - struct kvm_host_map map; + struct kvm_host_map map, map_save; int ret = 0; if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0); + struct vmcb *vmcb12; if (guest) { if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) @@ -4332,8 +4351,25 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (svm_allocate_nested(svm)) return 1; - ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, map.hva); + vmcb12 = map.hva; + + nested_load_control_from_vmcb12(svm, &vmcb12->control); + + ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12); kvm_vcpu_unmap(vcpu, &map, true); + + /* + * Restore L1 host state from L1 HSAVE area as VMCB01 was + * used during SMM (see svm_enter_smm()) + */ + if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), + &map_save) == -EINVAL) + return 1; + + svm_copy_vmrun_state(&svm->vmcb01.ptr->save, + map_save.hva + 0x400); + + kvm_vcpu_unmap(vcpu, &map_save, true); } } @@ -4542,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f89b623bb591..524d943f3efc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -31,6 +31,7 @@ #define MSRPM_OFFSETS 16 extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; +extern bool intercept_smi; /* * Clean bits in VMCB. @@ -463,7 +464,9 @@ void svm_leave_nested(struct vcpu_svm *svm); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct kvm_vcpu *vcpu); -void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); +void svm_copy_vmrun_state(struct vmcb_save_area *to_save, + struct vmcb_save_area *from_save); +void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb); int nested_svm_vmexit(struct vcpu_svm *svm); static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) @@ -479,6 +482,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); +void nested_load_control_from_vmcb12(struct vcpu_svm *svm, + struct vmcb_control_area *control); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm); void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb); @@ -498,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); -} - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -519,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -529,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu); void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h index 9b9a55abc29f..c53b8bf8d013 100644 --- a/arch/x86/kvm/svm/svm_onhyperv.h +++ b/arch/x86/kvm/svm/svm_onhyperv.h @@ -89,7 +89,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments( * as we mark it dirty unconditionally towards end of vcpu * init phase. */ - if (vmcb && vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && + if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) && hve->hv_enlightenments_control.msr_bitmap) vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS); } diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 8170f2a5a16f..22e2b019de37 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -4,7 +4,7 @@ #include <linux/compiler_types.h> -#include <asm/kvm_host.h> +#include "x86.h" #define svm_asm(insn, clobber...) \ do { \ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b484141ea15b..03ebe368333e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -92,6 +92,21 @@ TRACE_EVENT(kvm_hv_hypercall, __entry->outgpa) ); +TRACE_EVENT(kvm_hv_hypercall_done, + TP_PROTO(u64 result), + TP_ARGS(result), + + TP_STRUCT__entry( + __field(__u64, result) + ), + + TP_fast_assign( + __entry->result = result; + ), + + TP_printk("result 0x%llx", __entry->result) +); + /* * Tracepoint for Xen hypercall. */ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 896b2a50b4aa..0dab1b7b529f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #if IS_ENABLED(CONFIG_HYPERV) -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 2ec9b46f0d0c..152ab0aa82cf 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -73,8 +73,6 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -#undef ROL16 - static inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..ccb03d69546c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; @@ -2170,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, } } -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@ -2181,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@ -2234,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@ -2245,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2285,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@ -2302,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@ -3347,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { @@ -4267,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4278,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4296,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4382,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation @@ -5325,14 +5366,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -#define EPTP_PA_MASK GENMASK_ULL(51, 12) - -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) -{ - return VALID_PAGE(root_hpa) && - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); -} - /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { @@ -5826,7 +5859,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_flags || !enable_ept; + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4b9957e2bf5b..6e5de2e2b0da 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,6 +11,8 @@ #include "capabilities.h" +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + struct vmcs_hdr { u32 revision_id:31; u32 shadow_vmcs:1; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index d9f5d7c56ae3..cab6ba7a5005 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -2,7 +2,6 @@ #include "vmcs12.h" -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5e0e1b39f495..2a45f026ee11 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static inline short vmcs_field_to_offset(unsigned long field) { unsigned short offset; @@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field) return offset; } -#undef ROL16 - static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..0c2c0d5ae873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } + vmcs_writel(CR0_READ_SHADOW, cr0); + vmcs_writel(GUEST_CR0, hw_cr0); + vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) exit_lmode(vcpu); } #endif - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - vmcs_writel(CR0_READ_SHADOW, cr0); - vmcs_writel(GUEST_CR0, hw_cr0); - vcpu->arch.cr0 = cr0; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); @@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } -u32 vmx_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; - vmx->secondary_exec_control = exec_control; + return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 @@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_set(vmx, vmx->secondary_exec_control); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) @@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->emulation_required) + return; + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) @@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); @@ -6997,7 +7017,7 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 new_ctl = vmx->secondary_exec_control; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); @@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx); - } + vmx_setup_uret_msrs(vmx); + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -7803,7 +7823,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 3979a947933a..4858c5fd95f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -14,8 +14,6 @@ #include "vmx_ops.h" #include "cpuid.h" -extern const u32 vmx_msr_index[]; - #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 #define MSR_TYPE_RW 3 @@ -229,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for @@ -265,8 +263,6 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 secondary_exec_control; - /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -373,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); @@ -421,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ @@ -453,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; } -static inline u32 vmx_vmentry_ctrl(void) -{ - u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (vmx_pt_mode_is_system()) - vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static inline u32 vmx_vmexit_ctrl(void) -{ - u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (vmx_pt_mode_is_system()) - vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -u32 vmx_exec_control(struct vcpu_vmx *vmx); -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); - static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); @@ -524,7 +498,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow) static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { - return vmx->secondary_exec_control & + return secondary_exec_controls_get(vmx) & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 164b64f65a8f..9e9ef47e988c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include <linux/nospec.h> -#include <asm/kvm_host.h> #include <asm/vmx.h> #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6dc1b445231..28ef14155726 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -3407,7 +3416,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { vcpu->arch.apf.pageready_pending = false; @@ -3746,7 +3755,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = 0; @@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -4358,8 +4361,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { - return kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + /* + * Do not cause an interrupt window exit if an exception + * is pending or an event needs reinjection; userspace + * might want to inject the interrupt manually using KVM_SET_REGS + * or KVM_SET_SREGS. For that to work, we must be at an + * instruction boundary and with no events half-injected. + */ + return (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_accept_dm_intr(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + !vcpu->arch.exception.pending); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -6558,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -8569,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { + mutex_init(&kvm->arch.apicv_update_lock); + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); @@ -8882,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. IF=0) a window-open exit @@ -9227,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + activate = kvm_apicv_activated(vcpu->kvm); + if (vcpu->arch.apicv_active == activate) + goto out; + + vcpu->arch.apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9242,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) */ if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; - unsigned long old, new, expected; + unsigned long old, new; if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); - - if (!!old == !!new) - return; + old = new = kvm->arch.apicv_inhibit_reasons; - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + + if (!!old != !!new) { + trace_kvm_apicv_update_request(activate, bit); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + } + } else + kvm->arch.apicv_inhibit_reasons = new; +} +EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + mutex_lock(&kvm->arch.apicv_update_lock); + __kvm_request_apicv_update(kvm, activate, bit); + mutex_unlock(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9386,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -9599,8 +9620,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); } for (;;) { @@ -9628,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -9965,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10570,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -10788,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10854,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the @@ -10868,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -10985,9 +11049,6 @@ int kvm_arch_hardware_setup(void *opaque) int r; rdmsrl_safe(MSR_EFER, &host_efer); - if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) && - !(host_efer & EFER_NX))) - return -EIO; if (boot_cpu_has(X86_FEATURE_XSAVES)) rdmsrl(MSR_IA32_XSS, host_xss); @@ -11115,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } @@ -11304,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); WARN_ON(slot->arch.rmap[i]); @@ -11388,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) @@ -11473,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11553,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44ae10312740..7d66d63dc55a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,8 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +void kvm_spurious_fault(void); + static __always_inline void kvm_guest_enter_irqoff(void) { /* diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } |