diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2022-07-22 03:14:32 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2022-07-22 03:14:32 -0400 |
commit | a4850b5590d01bf3fb19fda3fc5d433f7382a974 (patch) | |
tree | d8c69909ebb8e43410cbab3b3988077ec824bc06 | |
parent | 8031d87aa9953ddeb047a5356ebd0b240c30f233 (diff) | |
parent | f5ecfee94493475783074e86ded10a0499d779fc (diff) | |
download | linux-a4850b5590d01bf3fb19fda3fc5d433f7382a974.tar.gz |
Merge tag 'kvm-s390-next-5.20-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD
KVM: s390x: Fixes and features for 5.20
* First part of deferred teardown
* CPU Topology
* interpretive execution for PCI instructions
* PV attestation
* Minor fixes
51 files changed, 1944 insertions, 172 deletions
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index cd9361f22530..bf1ef987e3d0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5955,6 +5955,52 @@ KVM_PV_DUMP_CPU Provides encrypted dump data like register values. The length of the returned data is provided by uv_info.guest_cpu_stor_len. +4.137 KVM_S390_ZPCI_OP +---------------------- + +:Capability: KVM_CAP_S390_ZPCI_OP +:Architectures: s390 +:Type: vm ioctl +:Parameters: struct kvm_s390_zpci_op (in) +:Returns: 0 on success, <0 on error + +Used to manage hardware-assisted virtualization features for zPCI devices. + +Parameters are specified via the following structure:: + + struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; + }; + +The type of operation is specified in the "op" field. +KVM_S390_ZPCIOP_REG_AEN is used to register the VM for adapter event +notification interpretation, which will allow firmware delivery of adapter +events directly to the vm, with KVM providing a backup delivery mechanism; +KVM_S390_ZPCIOP_DEREG_AEN is used to subsequently disable interpretation of +adapter event notifications. + +The target zPCI function must also be specified via the "fh" field. For the +KVM_S390_ZPCIOP_REG_AEN operation, additional information to establish firmware +delivery must be provided via the "reg_aen" struct. + +The "pad" and "reserved" fields may be used for future extensions and should be +set to 0s by userspace. 5. The kvm_run structure ======================== @@ -8223,6 +8269,31 @@ The capability has no effect if the nx_huge_pages module parameter is not set. This capability may only be set before any vCPUs are created. +8.39 KVM_CAP_S390_CPU_TOPOLOGY +------------------------------ + +:Capability: KVM_CAP_S390_CPU_TOPOLOGY +:Architectures: s390 +:Type: vm + +This capability indicates that KVM will provide the S390 CPU Topology +facility which consist of the interpretation of the PTF instruction for +the function code 2 along with interception and forwarding of both the +PTF instruction with function codes 0 or 1 and the STSI(15,1,x) +instruction to the userland hypervisor. + +The stfle facility 11, CPU Topology facility, should not be indicated +to the guest without this capability. + +When this capability is present, KVM provides a new attribute group +on vm fd, KVM_S390_VM_CPU_TOPOLOGY. +This new attribute allows to get, set or clear the Modified Change +Topology Report (MTCR) bit of the SCA through the kvm_device_attr +structure. + +When getting the Modified Change Topology Report value, the attr->addr +must point to a byte where the value will be stored or retrieved from. + 9. Known KVM API problems ========================= diff --git a/MAINTAINERS b/MAINTAINERS index e549a84e21c8..2dbc2031df2e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17453,6 +17453,7 @@ M: Eric Farman <farman@linux.ibm.com> L: linux-s390@vger.kernel.org L: kvm@vger.kernel.org S: Supported +F: arch/s390/kvm/pci* F: drivers/vfio/pci/vfio_pci_zdev.c F: include/uapi/linux/vfio_zdev.h diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index 67c737c1e580..a5fa667160b2 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -45,6 +45,8 @@ void uv_query_info(void) uv_info.supp_se_hdr_pcf = uvcb.supp_se_hdr_pcf; uv_info.conf_dump_storage_state_len = uvcb.conf_dump_storage_state_len; uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; + uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; + uv_info.supp_att_pflags = uvcb.supp_att_pflags; } #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 01936fdfaddb..e82e5626e139 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -12,10 +12,11 @@ #include <linux/bit_spinlock.h> #include <linux/dma-mapping.h> +#include <asm/tpi.h> struct airq_struct { struct hlist_node list; /* Handler queueing. */ - void (*handler)(struct airq_struct *airq, bool floating); + void (*handler)(struct airq_struct *airq, struct tpi_info *tpi_info); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ @@ -46,8 +47,10 @@ struct airq_iv { #define AIRQ_IV_PTR 4 /* Allocate the ptr array */ #define AIRQ_IV_DATA 8 /* Allocate the data array */ #define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ +#define AIRQ_IV_GUESTVEC 32 /* Vector is a pinned guest page */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec); void airq_iv_release(struct airq_iv *iv); unsigned long airq_iv_alloc(struct airq_iv *iv, unsigned long num); void airq_iv_free(struct airq_iv *iv, unsigned long bit, unsigned long num); diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 40264f60b0da..5cc46e0dde62 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -147,5 +147,42 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); int gmap_mark_unmergeable(void); -void s390_reset_acc(struct mm_struct *mm); +void s390_unlist_old_asce(struct gmap *gmap); +int s390_replace_asce(struct gmap *gmap); +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns); +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible); + +/** + * s390_uv_destroy_range - Destroy a range of pages in the given mm. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls, but + * it will otherwise only return when it completed. + */ +static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + (void)__s390_uv_destroy_range(mm, start, end, false); +} + +/** + * s390_uv_destroy_range_interruptible - Destroy a range of pages in the + * given mm, but stop when a fatal signal is received. + * @mm: the mm on which to operate on + * @start: the start of the range + * @end: the end of the range + * + * This function will call cond_sched, so it should not generate stalls. If + * a fatal signal is received, it will return with -EINTR immediately, + * without finishing destroying the whole range. Upon successful + * completion, 0 is returned. + */ +static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + return __s390_uv_destroy_range(mm, start, end, true); +} #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index a0fbe4820e0a..f39092e0ceaa 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -19,6 +19,8 @@ #include <linux/kvm.h> #include <linux/seqlock.h> #include <linux/module.h> +#include <linux/pci.h> +#include <linux/mmu_notifier.h> #include <asm/debug.h> #include <asm/cpu.h> #include <asm/fpu/api.h> @@ -93,19 +95,30 @@ union ipte_control { }; }; +union sca_utility { + __u16 val; + struct { + __u16 mtcr : 1; + __u16 reserved : 15; + }; +}; + struct bsca_block { union ipte_control ipte_control; __u64 reserved[5]; __u64 mcn; - __u64 reserved2; + union sca_utility utility; + __u8 reserved2[6]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; struct esca_block { union ipte_control ipte_control; - __u64 reserved1[7]; + __u64 reserved1[6]; + union sca_utility utility; + __u8 reserved2[6]; __u64 mcn[4]; - __u64 reserved2[20]; + __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; }; @@ -249,12 +262,16 @@ struct kvm_s390_sie_block { #define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 +#define ECB_PTF 0x01 __u8 ecb; /* 0x0061 */ #define ECB2_CMMA 0x80 #define ECB2_IEP 0x20 #define ECB2_PFMFI 0x08 #define ECB2_ESCA 0x04 +#define ECB2_ZPCI_LSI 0x02 __u8 ecb2; /* 0x0062 */ +#define ECB3_AISI 0x20 +#define ECB3_AISII 0x10 #define ECB3_DEA 0x08 #define ECB3_AES 0x04 #define ECB3_RI 0x01 @@ -759,6 +776,7 @@ struct kvm_vm_stat { u64 inject_pfault_done; u64 inject_service_signal; u64 inject_virtio; + u64 aen_forward; }; struct kvm_arch_memory_slot { @@ -924,6 +942,7 @@ struct kvm_s390_pv { unsigned long stor_base; void *stor_var; bool dumping; + struct mmu_notifier mmu_notifier; }; struct kvm_arch{ @@ -940,6 +959,7 @@ struct kvm_arch{ int use_cmma; int use_pfmfi; int use_skf; + int use_zpci_interp; int user_cpu_state_ctrl; int user_sigp; int user_stsi; @@ -963,6 +983,8 @@ struct kvm_arch{ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; + struct list_head kzdev_list; + spinlock_t kzdev_list_lock; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -1013,4 +1035,19 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +#define __KVM_HAVE_ARCH_VM_FREE +void kvm_arch_free_vm(struct kvm *kvm); + +#ifdef CONFIG_VFIO_PCI_ZDEV_KVM +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm); +void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev); +#else +static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev, + struct kvm *kvm) +{ + return -EPERM; +} +static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {} +#endif + #endif diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 82aae78e1315..1572b3634cdd 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -18,7 +18,7 @@ typedef struct { unsigned long asce_limit; unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ - atomic_t is_protected; + atomic_t protected_count; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. As they are only diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index c7937f369e62..2a38af5a00c2 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -26,7 +26,7 @@ static inline int init_new_context(struct task_struct *tsk, INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); - atomic_set(&mm->context.is_protected, 0); + atomic_set(&mm->context.protected_count, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; #ifdef CONFIG_PGSTE diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index fdb9745ee998..85eb0ef9d4c3 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -9,6 +9,7 @@ #include <asm-generic/pci.h> #include <asm/pci_clp.h> #include <asm/pci_debug.h> +#include <asm/pci_insn.h> #include <asm/sclp.h> #define PCIBIOS_MIN_IO 0x1000 @@ -97,6 +98,7 @@ struct zpci_bar_struct { }; struct s390_domain; +struct kvm_zdev; #define ZPCI_FUNCTIONS_PER_BUS 256 struct zpci_bus { @@ -123,11 +125,14 @@ struct zpci_dev { enum zpci_state state; u32 fid; /* function ID, used by sclp */ u32 fh; /* function handle, used by insn's */ + u32 gisa; /* GISA designation for passthrough */ u16 vfn; /* virtual function number */ u16 pchid; /* physical channel ID */ + u16 maxstbl; /* Maximum store block size */ u8 pfgid; /* function group ID */ u8 pft; /* pci function type */ u8 port; + u8 dtsm; /* Supported DT mask */ u8 rid_available : 1; u8 has_hp_slot : 1; u8 has_resources : 1; @@ -186,7 +191,10 @@ struct zpci_dev { struct dentry *debugfs_dev; + /* IOMMU and passthrough */ struct s390_domain *s390_domain; /* s390 IOMMU domain data */ + struct kvm_zdev *kzdev; + struct mutex kzdev_lock; }; static inline bool zdev_enabled(struct zpci_dev *zdev) @@ -198,6 +206,9 @@ extern const struct attribute_group *zpci_attr_groups[]; extern unsigned int s390_pci_force_floating __initdata; extern unsigned int s390_pci_no_rid; +extern union zpci_sic_iib *zpci_aipb; +extern struct airq_iv *zpci_aif_sbv; + /* ----------------------------------------------------------------------------- Prototypes ----------------------------------------------------------------------------- */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index 1f4b666e85ee..d6189ed14f84 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -153,9 +153,11 @@ struct clp_rsp_query_pci_grp { u8 : 6; u8 frame : 1; u8 refresh : 1; /* TLB refresh mode */ - u16 reserved2; + u16 : 3; + u16 maxstbl : 13; /* Maximum store block size */ u16 mui; - u16 : 16; + u8 dtsm; /* Supported DT mask */ + u8 reserved3; u16 maxfaal; u16 : 4; u16 dnoi : 12; @@ -173,7 +175,8 @@ struct clp_req_set_pci { u16 reserved2; u8 oc; /* operation controls */ u8 ndas; /* number of dma spaces */ - u64 reserved3; + u32 reserved3; + u32 gisa; /* GISA designation */ } __packed; /* Set PCI function response */ diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..e5f57cfe1d45 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -98,6 +98,15 @@ struct zpci_fib { u32 gd; } __packed __aligned(8); +/* Set Interruption Controls Operation Controls */ +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_SET_AENI_CONTROLS 2 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + /* directed interruption information block */ struct zpci_diib { u32 : 1; @@ -119,9 +128,20 @@ struct zpci_cdiib { u64 : 64; } __packed __aligned(8); +/* adapter interruption parameters block */ +struct zpci_aipb { + u64 faisb; + u64 gait; + u16 : 13; + u16 afi : 3; + u32 : 32; + u16 faal; +} __packed __aligned(8); + union zpci_sic_iib { struct zpci_diib diib; struct zpci_cdiib cdiib; + struct zpci_aipb aipb; }; DECLARE_STATIC_KEY_FALSE(have_mio); @@ -134,13 +154,6 @@ int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 *data, u64 req, u64 offset); void zpci_barrier(void); -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); - -static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) -{ - union zpci_sic_iib iib = {{0}}; - - return __zpci_set_irq_ctrl(ctl, isc, &iib); -} +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a397b072a580..cf81acf3879c 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -525,7 +525,7 @@ static inline int mm_has_pgste(struct mm_struct *mm) static inline int mm_is_protected(struct mm_struct *mm) { #ifdef CONFIG_PGSTE - if (unlikely(atomic_read(&mm->context.is_protected))) + if (unlikely(atomic_read(&mm->context.protected_count))) return 1; #endif return 0; @@ -1182,9 +1182,22 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, } else { res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - /* At this point the reference through the mapping is still present */ - if (mm_is_protected(mm) && pte_present(res)) - uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); + /* Nothing to do */ + if (!mm_is_protected(mm) || !pte_present(res)) + return res; + /* + * At this point the reference through the mapping is still present. + * The notifier should have destroyed all protected vCPUs at this + * point, so the destroy should be successful. + */ + if (full && !uv_destroy_owned_page(pte_val(res) & PAGE_MASK)) + return res; + /* + * If something went wrong and the page could not be destroyed, or + * if this is not a mm teardown, the slower export is used as + * fallback instead. + */ + uv_convert_owned_from_secure(pte_val(res) & PAGE_MASK); return res; } diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 236b34b75ddb..addefe8ccdba 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -88,6 +88,10 @@ struct sclp_info { unsigned char has_sipl : 1; unsigned char has_dirq : 1; unsigned char has_iplcc : 1; + unsigned char has_zpci_lsi : 1; + unsigned char has_aisii : 1; + unsigned char has_aeni : 1; + unsigned char has_aisi : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/asm/tpi.h b/arch/s390/include/asm/tpi.h index 1ac538b8cbf5..f76e5fdff23a 100644 --- a/arch/s390/include/asm/tpi.h +++ b/arch/s390/include/asm/tpi.h @@ -19,6 +19,19 @@ struct tpi_info { u32 :12; } __packed __aligned(4); +/* I/O-Interruption Code as stored by TPI for an Adapter I/O */ +struct tpi_adapter_info { + u32 aism:8; + u32 :22; + u32 error:1; + u32 forward:1; + u32 reserved; + u32 adapter_IO:1; + u32 directed_irq:1; + u32 isc:3; + u32 :27; +} __packed __aligned(4); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_TPI_H */ diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 3e597bb634bd..be3ef9dd6972 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -124,7 +124,10 @@ struct uv_cb_qui { u64 reservedc0; /* 0x00c0 */ u64 conf_dump_storage_state_len; /* 0x00c8 */ u64 conf_dump_finalize_len; /* 0x00d0 */ - u8 reservedd8[256 - 216]; /* 0x00d8 */ + u64 reservedd8; /* 0x00d8 */ + u64 supp_att_req_hdr_ver; /* 0x00e0 */ + u64 supp_att_pflags; /* 0x00e8 */ + u8 reservedf0[256 - 240]; /* 0x00f0 */ } __packed __aligned(8); /* Initialize Ultravisor */ @@ -350,6 +353,8 @@ struct uv_info { unsigned long supp_se_hdr_pcf; unsigned long conf_dump_storage_state_len; unsigned long conf_dump_finalize_len; + unsigned long supp_att_req_hdr_ver; + unsigned long supp_att_pflags; }; extern struct uv_info uv_info; @@ -421,6 +426,7 @@ static inline int is_prot_virt_host(void) } int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr); int uv_destroy_owned_page(unsigned long paddr); int uv_convert_from_secure(unsigned long paddr); int uv_convert_owned_from_secure(unsigned long paddr); diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 7a6b14874d65..a73cf01a1606 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -74,6 +74,7 @@ struct kvm_s390_io_adapter_req { #define KVM_S390_VM_CRYPTO 2 #define KVM_S390_VM_CPU_MODEL 3 #define KVM_S390_VM_MIGRATION 4 +#define KVM_S390_VM_CPU_TOPOLOGY 5 /* kvm attributes for mem_ctrl */ #define KVM_S390_VM_MEM_ENABLE_CMMA 0 diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 84fe33b6af4d..f9810d2a267c 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -234,6 +234,32 @@ static int make_secure_pte(pte_t *ptep, unsigned long addr, return uvcb->rc == 0x10a ? -ENXIO : -EINVAL; } +/** + * should_export_before_import - Determine whether an export is needed + * before an import-like operation + * @uvcb: the Ultravisor control block of the UVC to be performed + * @mm: the mm of the process + * + * Returns whether an export is needed before every import-like operation. + * This is needed for shared pages, which don't trigger a secure storage + * exception when accessed from a different guest. + * + * Although considered as one, the Unpin Page UVC is not an actual import, + * so it is not affected. + * + * No export is needed also when there is only one protected VM, because the + * page cannot belong to the wrong VM in that case (there is no "other VM" + * it can belong to). + * + * Return: true if an export is needed before every import, otherwise false. + */ +static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) +{ + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) + return false; + return atomic_read(&mm->context.protected_count) > 1; +} + /* * Requests the Ultravisor to make a page accessible to a guest. * If it's brought in the first time, it will be cleared. If @@ -277,6 +303,8 @@ again: lock_page(page); ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + if (should_export_before_import(uvcb, gmap->mm)) + uv_convert_from_secure(page_to_phys(page)); rc = make_secure_pte(ptep, uaddr, page, uvcb); pte_unmap_unlock(ptep, ptelock); unlock_page(page); @@ -334,6 +362,61 @@ int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) } EXPORT_SYMBOL_GPL(gmap_convert_to_secure); +/** + * gmap_destroy_page - Destroy a guest page. + * @gmap: the gmap of the guest + * @gaddr: the guest address to destroy + * + * An attempt will be made to destroy the given guest page. If the attempt + * fails, an attempt is made to export the page. If both attempts fail, an + * appropriate error is returned. + */ +int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr) +{ + struct vm_area_struct *vma; + unsigned long uaddr; + struct page *page; + int rc; + + rc = -EFAULT; + mmap_read_lock(gmap->mm); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = vma_lookup(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Huge pages should not be able to become secure + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = 0; + /* we take an extra reference here */ + page = follow_page(vma, uaddr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + rc = uv_destroy_owned_page(page_to_phys(page)); + /* + * Fault handlers can race; it is possible that two CPUs will fault + * on the same secure page. One CPU can destroy the page, reboot, + * re-enter secure mode and import it, while the second CPU was + * stuck at the beginning of the handler. At some point the second + * CPU will be able to progress, and it will not be able to destroy + * the page. In that case we do not want to terminate the process, + * we instead try to export the page. + */ + if (rc) + rc = uv_convert_owned_from_secure(page_to_phys(page)); + put_page(page); +out: + mmap_read_unlock(gmap->mm); + return rc; +} +EXPORT_SYMBOL_GPL(gmap_destroy_page); + /* * To be called with the page locked or with an extra reference! This will * prevent gmap_make_secure from touching the page concurrently. Having 2 @@ -479,6 +562,24 @@ static ssize_t uv_query_max_guest_addr(struct kobject *kobj, static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); +static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver); +} + +static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = + __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); + +static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags); +} + +static struct kobj_attribute uv_query_supp_att_pflags_attr = + __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, @@ -490,6 +591,8 @@ static struct attribute *uv_query_attrs[] = { &uv_query_dump_storage_state_len_attr.attr, &uv_query_dump_finalize_len_attr.attr, &uv_query_dump_cpu_len_attr.attr, + &uv_query_supp_att_req_hdr_ver_attr.attr, + &uv_query_supp_att_pflags_attr.attr, NULL, }; diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 2e84d3922f7c..33f4ff909476 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -34,6 +34,7 @@ config KVM select SRCU select KVM_VFIO select INTERVAL_TREE + select MMU_NOTIFIER help Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 26f4a74e5ce4..02217fb4ae10 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -10,4 +10,5 @@ ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o +kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 227ed0009354..082ec5f2c3a5 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -262,77 +262,77 @@ struct aste { /* .. more fields there */ }; -int ipte_lock_held(struct kvm_vcpu *vcpu) +int ipte_lock_held(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) { + if (sclp.has_siif) { int rc; - read_lock(&vcpu->kvm->arch.sca_lock); - rc = kvm_s390_get_ipte_control(vcpu->kvm)->kh != 0; - read_unlock(&vcpu->kvm->arch.sca_lock); + read_lock(&kvm->arch.sca_lock); + rc = kvm_s390_get_ipte_control(kvm)->kh != 0; + read_unlock(&kvm->arch.sca_lock); return rc; } - return vcpu->kvm->arch.ipte_lock_count != 0; + return kvm->arch.ipte_lock_count != 0; } -static void ipte_lock_simple(struct kvm_vcpu *vcpu) +static void ipte_lock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count++; - if (vcpu->kvm->arch.ipte_lock_count > 1) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count++; + if (kvm->arch.ipte_lock_count > 1) goto out; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.k) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } new = old; new.k = 1; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_unlock_simple(struct kvm_vcpu *vcpu) +static void ipte_unlock_simple(struct kvm *kvm) { union ipte_control old, new, *ic; - mutex_lock(&vcpu->kvm->arch.ipte_mutex); - vcpu->kvm->arch.ipte_lock_count--; - if (vcpu->kvm->arch.ipte_lock_count) + mutex_lock(&kvm->arch.ipte_mutex); + kvm->arch.ipte_lock_count--; + if (kvm->arch.ipte_lock_count) goto out; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); - wake_up(&vcpu->kvm->arch.ipte_wq); + read_unlock(&kvm->arch.sca_lock); + wake_up(&kvm->arch.ipte_wq); out: - mutex_unlock(&vcpu->kvm->arch.ipte_mutex); + mutex_unlock(&kvm->arch.ipte_mutex); } -static void ipte_lock_siif(struct kvm_vcpu *vcpu) +static void ipte_lock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; retry: - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); if (old.kg) { - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); cond_resched(); goto retry; } @@ -340,15 +340,15 @@ retry: new.k = 1; new.kh++; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); } -static void ipte_unlock_siif(struct kvm_vcpu *vcpu) +static void ipte_unlock_siif(struct kvm *kvm) { union ipte_control old, new, *ic; - read_lock(&vcpu->kvm->arch.sca_lock); - ic = kvm_s390_get_ipte_control(vcpu->kvm); + read_lock(&kvm->arch.sca_lock); + ic = kvm_s390_get_ipte_control(kvm); do { old = READ_ONCE(*ic); new = old; @@ -356,25 +356,25 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) if (!new.kh) new.k = 0; } while (cmpxchg(&ic->val, old.val, new.val) != old.val); - read_unlock(&vcpu->kvm->arch.sca_lock); + read_unlock(&kvm->arch.sca_lock); if (!new.kh) - wake_up(&vcpu->kvm->arch.ipte_wq); + wake_up(&kvm->arch.ipte_wq); } -void ipte_lock(struct kvm_vcpu *vcpu) +void ipte_lock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_lock_siif(vcpu); + if (sclp.has_siif) + ipte_lock_siif(kvm); else - ipte_lock_simple(vcpu); + ipte_lock_simple(kvm); } -void ipte_unlock(struct kvm_vcpu *vcpu) +void ipte_unlock(struct kvm *kvm) { - if (vcpu->arch.sie_block->eca & ECA_SII) - ipte_unlock_siif(vcpu); + if (sclp.has_siif) + ipte_unlock_siif(kvm); else - ipte_unlock_simple(vcpu); + ipte_unlock_simple(kvm); } static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar, @@ -1086,7 +1086,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, try_storage_prot_override = storage_prot_override_applicable(vcpu); need_ipte_lock = psw_bits(*psw).dat && !asce.r; if (need_ipte_lock) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); /* * Since we do the access further down ultimately via a move instruction * that does key checking and returns an error in case of a protection @@ -1127,7 +1127,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, } out_unlock: if (need_ipte_lock) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); if (nr_pages > ARRAY_SIZE(gpa_array)) vfree(gpas); return rc; @@ -1199,10 +1199,10 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar, rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode); if (rc) return rc; - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = guest_range_to_gpas(vcpu, gva, ar, NULL, length, asce, mode, access_key); - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return rc; } @@ -1465,7 +1465,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, * tables/pointers we read stay valid - unshadowing is however * always possible - only guest_table_lock protects us. */ - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); if (rc) @@ -1499,7 +1499,7 @@ shadow_page: pte.p |= dat_protection; if (!rc) rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); mmap_read_unlock(sg->mm); return rc; } diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index 1124ff282012..9408d6cc8e2c 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -440,9 +440,9 @@ int read_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data, return access_guest_real(vcpu, gra, data, len, 0); } -void ipte_lock(struct kvm_vcpu *vcpu); -void ipte_unlock(struct kvm_vcpu *vcpu); -int ipte_lock_held(struct kvm_vcpu *vcpu); +void ipte_lock(struct kvm *kvm); +void ipte_unlock(struct kvm *kvm); +int ipte_lock_held(struct kvm *kvm); int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); /* MVPG PEI indication bits */ diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 8bd42a20d924..88112065d941 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -528,12 +528,27 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu) static int handle_pv_notification(struct kvm_vcpu *vcpu) { + int ret; + if (vcpu->arch.sie_block->ipa == 0xb210) return handle_pv_spx(vcpu); if (vcpu->arch.sie_block->ipa == 0xb220) return handle_pv_sclp(vcpu); if (vcpu->arch.sie_block->ipa == 0xb9a4) return handle_pv_uvc(vcpu); + if (vcpu->arch.sie_block->ipa >> 8 == 0xae) { + /* + * Besides external call, other SIGP orders also cause a + * 108 (pv notify) intercept. In contrast to external call, + * these orders need to be emulated and hence the appropriate + * place to handle them is in handle_instruction(). + * So first try kvm_s390_handle_sigp_pei() and if that isn't + * successful, go on with handle_instruction(). + */ + ret = kvm_s390_handle_sigp_pei(vcpu); + if (!ret) + return ret; + } return handle_instruction(vcpu); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index af96dc0549a4..b9c944b262c7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -28,9 +28,11 @@ #include <asm/switch_to.h> #include <asm/nmi.h> #include <asm/airq.h> +#include <asm/tpi.h> #include "kvm-s390.h" #include "gaccess.h" #include "trace-s390.h" +#include "pci.h" #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 @@ -702,7 +704,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) /* * We indicate floating repressible conditions along with * other pending conditions. Channel Report Pending and Channel - * Subsystem damage are the only two and and are indicated by + * Subsystem damage are the only two and are indicated by * bits in mcic and masked in cr14. */ if (test_and_clear_bit(IRQ_PEND_MCHK_REP, &fi->pending_irqs)) { @@ -3311,10 +3313,87 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) +static void aen_host_forward(unsigned long si) { + struct kvm_s390_gisa_interrupt *gi; + struct zpci_gaite *gaite; + struct kvm *kvm; + + gaite = (struct zpci_gaite *)aift->gait + + (si * sizeof(struct zpci_gaite)); + if (gaite->count == 0) + return; + if (gaite->aisb != 0) + set_bit_inv(gaite->aisbo, (unsigned long *)gaite->aisb); + + kvm = kvm_s390_pci_si_to_kvm(aift, si); + if (!kvm) + return; + gi = &kvm->arch.gisa_int; + + if (!(gi->origin->g1.simm & AIS_MODE_MASK(gaite->gisc)) || + !(gi->origin->g1.nimm & AIS_MODE_MASK(gaite->gisc))) { + gisa_set_ipm_gisc(gi->origin, gaite->gisc); + if (hrtimer_active(&gi->timer)) + hrtimer_cancel(&gi->timer); + hrtimer_start(&gi->timer, 0, HRTIMER_MODE_REL); + kvm->stat.aen_forward++; + } +} + +static void aen_process_gait(u8 isc) +{ + bool found = false, first = true; + union zpci_sic_iib iib = {{0}}; + unsigned long si, flags; + + spin_lock_irqsave(&aift->gait_lock, flags); + + if (!aift->gait) { + spin_unlock_irqrestore(&aift->gait_lock, flags); + return; + } + + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(aift->sbv, si, airq_iv_end(aift->sbv)); + if (si == -1UL) { + if (first || found) { + /* Re-enable interrupts. */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, isc, + &iib); + first = found = false; + } else { + /* Interrupts on and all bits processed */ + break; + } + found = false; + si = 0; + /* Scan again after re-enabling interrupts */ + continue; + } + found = true; + aen_host_forward(si); + } + + spin_unlock_irqrestore(&aift->gait_lock, flags); +} + +static void gib_alert_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) +{ + struct tpi_adapter_info *info = (struct tpi_adapter_info *)tpi_info; + inc_irq_stat(IRQIO_GAL); - process_gib_alert_list(); + + if ((info->forward || info->error) && + IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + aen_process_gait(info->isc); + if (info->aism != 0) + process_gib_alert_list(); + } else { + process_gib_alert_list(); + } } static struct airq_struct gib_alert_irq = { @@ -3326,6 +3405,11 @@ void kvm_s390_gib_destroy(void) { if (!gib) return; + if (kvm_s390_pci_interp_allowed() && aift) { + mutex_lock(&aift->aift_lock); + kvm_s390_pci_aen_exit(); + mutex_unlock(&aift->aift_lock); + } chsc_sgib(0); unregister_adapter_interrupt(&gib_alert_irq); free_page((unsigned long)gib); @@ -3363,6 +3447,14 @@ int kvm_s390_gib_init(u8 nisc) goto out_unreg_gal; } + if (kvm_s390_pci_interp_allowed()) { + if (kvm_s390_pci_aen_init(nisc)) { + pr_err("Initializing AEN for PCI failed\n"); + rc = -EIO; + goto out_unreg_gal; + } + } + KVM_EVENT(3, "gib 0x%pK (nisc=%d) initialized", gib, gib->nisc); goto out; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 72bd5c9b9617..edfd4bbd0cba 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -31,6 +31,7 @@ #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/pgtable.h> +#include <linux/mmu_notifier.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> @@ -47,6 +48,7 @@ #include <asm/fpu/api.h> #include "kvm-s390.h" #include "gaccess.h" +#include "pci.h" #define CREATE_TRACE_POINTS #include "trace.h" @@ -63,7 +65,8 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_float_mchk), STATS_DESC_COUNTER(VM, inject_pfault_done), STATS_DESC_COUNTER(VM, inject_service_signal), - STATS_DESC_COUNTER(VM, inject_virtio) + STATS_DESC_COUNTER(VM, inject_virtio), + STATS_DESC_COUNTER(VM, aen_forward) }; const struct kvm_stats_header kvm_vm_stats_header = { @@ -502,6 +505,14 @@ int kvm_arch_init(void *opaque) goto out; } + if (kvm_s390_pci_interp_allowed()) { + rc = kvm_s390_pci_init(); + if (rc) { + pr_err("Unable to allocate AIFT for PCI\n"); + goto out; + } + } + rc = kvm_s390_gib_init(GAL_ISC); if (rc) goto out; @@ -516,6 +527,8 @@ out: void kvm_arch_exit(void) { kvm_s390_gib_destroy(); + if (kvm_s390_pci_interp_allowed()) + kvm_s390_pci_exit(); debug_unregister(kvm_s390_dbf); debug_unregister(kvm_s390_dbf_uv); } @@ -626,6 +639,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) } break; } + case KVM_CAP_S390_ZPCI_OP: + r = kvm_s390_pci_interp_allowed(); + break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = test_facility(11); + break; default: r = 0; } @@ -837,6 +856,20 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) icpt_operexc_on_all_vcpus(kvm); r = 0; break; + case KVM_CAP_S390_CPU_TOPOLOGY: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else if (test_facility(11)) { + set_kvm_facility(kvm->arch.model.fac_mask, 11); + set_kvm_facility(kvm->arch.model.fac_list, 11); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_CPU_TOPOLOGY %s", + r ? "(not available)" : "(success)"); + break; default: r = -EINVAL; break; @@ -1039,6 +1072,42 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static void kvm_s390_vcpu_pci_setup(struct kvm_vcpu *vcpu) +{ + /* Only set the ECB bits after guest requests zPCI interpretation */ + if (!vcpu->kvm->arch.use_zpci_interp) + return; + + vcpu->arch.sie_block->ecb2 |= ECB2_ZPCI_LSI; + vcpu->arch.sie_block->ecb3 |= ECB3_AISII + ECB3_AISI; +} + +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + + lockdep_assert_held(&kvm->lock); + + if (!kvm_s390_pci_interp_allowed()) + return; + + /* + * If host is configured for PCI and the necessary facilities are + * available, turn on interpretation for the life of this guest + */ + kvm->arch.use_zpci_interp = 1; + + kvm_s390_vcpu_block_all(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_s390_vcpu_pci_setup(vcpu); + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } + + kvm_s390_vcpu_unblock_all(kvm); +} + static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) { unsigned long cx; @@ -1711,6 +1780,57 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr) return ret; } +/** + * kvm_s390_update_topology_change_report - update CPU topology change report + * @kvm: guest KVM description + * @val: set or clear the MTCR bit + * + * Updates the Multiprocessor Topology-Change-Report bit to signal + * the guest with a topology change. + * This is only relevant if the topology facility is present. + * + * The SCA version, bsca or esca, doesn't matter as offset is the same. + */ +static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) +{ + union sca_utility new, old; + struct bsca_block *sca; + + read_lock(&kvm->arch.sca_lock); + sca = kvm->arch.sca; + do { + old = READ_ONCE(sca->utility); + new = old; + new.mtcr = val; + } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + read_unlock(&kvm->arch.sca_lock); +} + +static int kvm_s390_set_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + kvm_s390_update_topology_change_report(kvm, !!attr->attr); + return 0; +} + +static int kvm_s390_get_topo_change_indication(struct kvm *kvm, + struct kvm_device_attr *attr) +{ + u8 topo; + + if (!test_kvm_facility(kvm, 11)) + return -ENXIO; + + read_lock(&kvm->arch.sca_lock); + topo = ((struct bsca_block *)kvm->arch.sca)->utility.mtcr; + read_unlock(&kvm->arch.sca_lock); + + return put_user(topo, (u8 __user *)attr->addr); +} + static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { int ret; @@ -1731,6 +1851,9 @@ static int kvm_s390_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_set_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_set_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1756,6 +1879,9 @@ static int kvm_s390_vm_get_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = kvm_s390_vm_get_migration(kvm, attr); break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = kvm_s390_get_topo_change_indication(kvm, attr); + break; default: ret = -ENXIO; break; @@ -1829,6 +1955,9 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_MIGRATION: ret = 0; break; + case KVM_S390_VM_CPU_TOPOLOGY: + ret = test_kvm_facility(kvm, 11) ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -2186,12 +2315,25 @@ out: return r; } -static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +/** + * kvm_s390_cpus_from_pv - Convert all protected vCPUs in a protected VM to + * non protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Does not stop in case of error, tries to convert as many + * CPUs as possible. In case of error, the RC and RRC of the last error are + * returned. + * + * Return: 0 in case of success, otherwise -EIO + */ +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { struct kvm_vcpu *vcpu; - u16 rc, rrc; - int ret = 0; unsigned long i; + u16 _rc, _rrc; + int ret = 0; /* * We ignore failures and try to destroy as many CPUs as possible. @@ -2203,9 +2345,9 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) */ kvm_for_each_vcpu(i, vcpu, kvm) { mutex_lock(&vcpu->mutex); - if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { - *rcp = rc; - *rrcp = rrc; + if (kvm_s390_pv_destroy_cpu(vcpu, &_rc, &_rrc) && !ret) { + *rc = _rc; + *rrc = _rrc; ret = -EIO; } mutex_unlock(&vcpu->mutex); @@ -2216,6 +2358,17 @@ static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) return ret; } +/** + * kvm_s390_cpus_to_pv - Convert all non-protected vCPUs in a protected VM + * to protected. + * @kvm: the VM whose protected vCPUs are to be converted + * @rc: return value for the RC field of the UVC (in case of error) + * @rrc: return value for the RRC field of the UVC (in case of error) + * + * Tries to undo the conversion in case of error. + * + * Return: 0 in case of success, otherwise -EIO + */ static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) { unsigned long i; @@ -2772,6 +2925,19 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_S390_ZPCI_OP: { + struct kvm_s390_zpci_op args; + + r = -EINVAL; + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + break; + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + r = kvm_s390_pci_zpci_op(kvm, &args); + break; + } default: r = -ENOTTY; } @@ -2933,6 +3099,14 @@ static void sca_dispose(struct kvm *kvm) kvm->arch.sca = NULL; } +void kvm_arch_free_vm(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) + kvm_s390_pci_clear_list(kvm); + + __kvm_arch_free_vm(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { gfp_t alloc_flags = GFP_KERNEL_ACCOUNT; @@ -3015,6 +3189,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_crypto_init(kvm); + if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) { + mutex_lock(&kvm->lock); + kvm_s390_pci_init_list(kvm); + kvm_s390_vcpu_pci_enable_interp(kvm); + mutex_unlock(&kvm->lock); + } + mutex_init(&kvm->arch.float_int.ais_lock); spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) @@ -3068,6 +3249,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_is_ucontrol(vcpu->kvm)) sca_del_vcpu(vcpu); + kvm_s390_update_topology_change_report(vcpu->kvm, 1); if (kvm_is_ucontrol(vcpu->kvm)) gmap_remove(vcpu->arch.gmap); @@ -3095,6 +3277,15 @@ void kvm_arch_destroy_vm(struct kvm *kvm) */ if (kvm_s390_pv_get_handle(kvm)) kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + /* + * Remove the mmu notifier only when the whole KVM VM is torn down, + * and only if one was registered to begin with. If the VM is + * currently not protected, but has been previously been protected, + * then it's possible that the notifier is still registered. + */ + if (kvm->arch.pv.mmu_notifier.ops) + mmu_notifier_unregister(&kvm->arch.pv.mmu_notifier, kvm->mm); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) @@ -3461,6 +3652,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) vcpu->arch.sie_block->ecb |= ECB_SRSI; + if (test_kvm_facility(vcpu->kvm, 11)) + vcpu->arch.sie_block->ecb |= ECB_PTF; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; if (!kvm_is_ucontrol(vcpu->kvm)) @@ -3513,6 +3706,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + kvm_s390_vcpu_pci_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); if (kvm_s390_pv_is_protected(vcpu->kvm)) { rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); @@ -3592,6 +3787,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) rc = kvm_s390_vcpu_setup(vcpu); if (rc) goto out_ucontrol_uninit; + + kvm_s390_update_topology_change_report(vcpu->kvm, 1); return 0; out_ucontrol_uninit: diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index dd01d989816f..f6fd668f887e 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -379,6 +379,7 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); +int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); /* implemented in diag.c */ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu); @@ -513,6 +514,16 @@ void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu, void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm); /** + * kvm_s390_vcpu_pci_enable_interp + * + * Set the associated PCI attributes for each vcpu to allow for zPCI Load/Store + * interpretation as well as adapter interruption forwarding. + * + * @kvm: the KVM guest + */ +void kvm_s390_vcpu_pci_enable_interp(struct kvm *kvm); + +/** * diag9c_forwarding_hz * * Set the maximum number of diag9c forwarding per second diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c new file mode 100644 index 000000000000..4946fb7757d6 --- /dev/null +++ b/arch/s390/kvm/pci.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/pci.h> +#include <asm/pci.h> +#include <asm/pci_insn.h> +#include <asm/pci_io.h> +#include <asm/sclp.h> +#include "pci.h" +#include "kvm-s390.h" + +struct zpci_aift *aift; + +static inline int __set_irq_noiib(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return zpci_set_irq_ctrl(ctl, isc, &iib); +} + +void kvm_s390_pci_aen_exit(void) +{ + unsigned long flags; + struct kvm_zdev **gait_kzdev; + + lockdep_assert_held(&aift->aift_lock); + + /* + * Contents of the aipb remain registered for the life of the host + * kernel, the information preserved in zpci_aipb and zpci_aif_sbv + * in case we insert the KVM module again later. Clear the AIFT + * information and free anything not registered with underlying + * firmware. + */ + spin_lock_irqsave(&aift->gait_lock, flags); + gait_kzdev = aift->kzdev; + aift->gait = NULL; + aift->sbv = NULL; + aift->kzdev = NULL; + spin_unlock_irqrestore(&aift->gait_lock, flags); + + kfree(gait_kzdev); +} + +static int zpci_setup_aipb(u8 nisc) +{ + struct page *page; + int size, rc; + + zpci_aipb = kzalloc(sizeof(union zpci_sic_iib), GFP_KERNEL); + if (!zpci_aipb) + return -ENOMEM; + + aift->sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, 0); + if (!aift->sbv) { + rc = -ENOMEM; + goto free_aipb; + } + zpci_aif_sbv = aift->sbv; + size = get_order(PAGE_ALIGN(ZPCI_NR_DEVICES * + sizeof(struct zpci_gaite))); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, size); + if (!page) { + rc = -ENOMEM; + goto free_sbv; + } + aift->gait = (struct zpci_gaite *)page_to_phys(page); + + zpci_aipb->aipb.faisb = virt_to_phys(aift->sbv->vector); + zpci_aipb->aipb.gait = virt_to_phys(aift->gait); + zpci_aipb->aipb.afi = nisc; + zpci_aipb->aipb.faal = ZPCI_NR_DEVICES; + + /* Setup Adapter Event Notification Interpretation */ + if (zpci_set_irq_ctrl(SIC_SET_AENI_CONTROLS, 0, zpci_aipb)) { + rc = -EIO; + goto free_gait; + } + + return 0; + +free_gait: + free_pages((unsigned long)aift->gait, size); +free_sbv: + airq_iv_release(aift->sbv); + zpci_aif_sbv = NULL; +free_aipb: + kfree(zpci_aipb); + zpci_aipb = NULL; + + return rc; +} + +static int zpci_reset_aipb(u8 nisc) +{ + /* + * AEN registration can only happen once per system boot. If + * an aipb already exists then AEN was already registered and + * we can re-use the aipb contents. This can only happen if + * the KVM module was removed and re-inserted. However, we must + * ensure that the same forwarding ISC is used as this is assigned + * during KVM module load. + */ + if (zpci_aipb->aipb.afi != nisc) + return -EINVAL; + + aift->sbv = zpci_aif_sbv; + aift->gait = (struct zpci_gaite *)zpci_aipb->aipb.gait; + + return 0; +} + +int kvm_s390_pci_aen_init(u8 nisc) +{ + int rc = 0; + + /* If already enabled for AEN, bail out now */ + if (aift->gait || aift->sbv) + return -EPERM; + + mutex_lock(&aift->aift_lock); + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev), + GFP_KERNEL); + if (!aift->kzdev) { + rc = -ENOMEM; + goto unlock; + } + + if (!zpci_aipb) + rc = zpci_setup_aipb(nisc); + else + rc = zpci_reset_aipb(nisc); + if (rc) + goto free_zdev; + + /* Enable floating IRQs */ + if (__set_irq_noiib(SIC_IRQ_MODE_SINGLE, nisc)) { + rc = -EIO; + kvm_s390_pci_aen_exit(); + } + + goto unlock; + +free_zdev: + kfree(aift->kzdev); +unlock: + mutex_unlock(&aift->aift_lock); + return rc; +} + +/* Modify PCI: Register floating adapter interruption forwarding */ +static int kvm_zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {}; + u8 status; + + fib.fmt0.isc = zdev->kzdev->fib.fmt0.isc; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = virt_to_phys(zdev->aibv->vector); + fib.fmt0.aibvo = 0; + fib.fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister floating adapter interruption forwarding */ +static int kvm_zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {}; + u8 cc, status; + + fib.gd = zdev->gisa; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static inline void unaccount_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + + if (user) + atomic_long_sub(nr_pages, &user->locked_vm); + if (current->mm) + atomic64_sub(nr_pages, ¤t->mm->pinned_vm); +} + +static inline int account_mem(unsigned long nr_pages) +{ + struct user_struct *user = get_uid(current_user()); + unsigned long page_limit, cur_pages, new_pages; + + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + atomic64_add(nr_pages, ¤t->mm->pinned_vm); + + return 0; +} + +static int kvm_s390_pci_aif_enable(struct zpci_dev *zdev, struct zpci_fib *fib, + bool assist) +{ + struct page *pages[1], *aibv_page, *aisb_page = NULL; + unsigned int msi_vecs, idx; + struct zpci_gaite *gaite; + unsigned long hva, bit; + struct kvm *kvm; + phys_addr_t gaddr; + int rc = 0, gisc, npages, pcount = 0; + + /* + * Interrupt forwarding is only applicable if the device is already + * enabled for interpretation + */ + if (zdev->gisa == 0) + return -EINVAL; + + kvm = zdev->kzdev->kvm; + msi_vecs = min_t(unsigned int, fib->fmt0.noi, zdev->max_msi); + + /* Get the associated forwarding ISC - if invalid, return the error */ + gisc = kvm_s390_gisc_register(kvm, fib->fmt0.isc); + if (gisc < 0) + return gisc; + + /* Replace AIBV address */ + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aibv)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto out; + } + aibv_page = pages[0]; + pcount++; + gaddr = page_to_phys(aibv_page) + (fib->fmt0.aibv & ~PAGE_MASK); + fib->fmt0.aibv = gaddr; + + /* Pin the guest AISB if one was specified */ + if (fib->fmt0.sum == 1) { + idx = srcu_read_lock(&kvm->srcu); + hva = gfn_to_hva(kvm, gpa_to_gfn((gpa_t)fib->fmt0.aisb)); + npages = pin_user_pages_fast(hva, 1, FOLL_WRITE | FOLL_LONGTERM, + pages); + srcu_read_unlock(&kvm->srcu, idx); + if (npages < 1) { + rc = -EIO; + goto unpin1; + } + aisb_page = pages[0]; + pcount++; + } + + /* Account for pinned pages, roll back on failure */ + if (account_mem(pcount)) + goto unpin2; + + /* AISB must be allocated before we can fill in GAITE */ + mutex_lock(&aift->aift_lock); + bit = airq_iv_alloc_bit(aift->sbv); + if (bit == -1UL) + goto unlock; + zdev->aisb = bit; /* store the summary bit number */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | + AIRQ_IV_BITLOCK | + AIRQ_IV_GUESTVEC, + phys_to_virt(fib->fmt0.aibv)); + + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + + /* If assist not requested, host will get all alerts */ + if (assist) + gaite->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + else + gaite->gisa = 0; + + gaite->gisc = fib->fmt0.isc; + gaite->count++; + gaite->aisbo = fib->fmt0.aisbo; + gaite->aisb = virt_to_phys(page_address(aisb_page) + (fib->fmt0.aisb & + ~PAGE_MASK)); + aift->kzdev[zdev->aisb] = zdev->kzdev; + spin_unlock_irq(&aift->gait_lock); + + /* Update guest FIB for re-issue */ + fib->fmt0.aisbo = zdev->aisb & 63; + fib->fmt0.aisb = virt_to_phys(aift->sbv->vector + (zdev->aisb / 64) * 8); + fib->fmt0.isc = gisc; + + /* Save some guest fib values in the host for later use */ + zdev->kzdev->fib.fmt0.isc = fib->fmt0.isc; + zdev->kzdev->fib.fmt0.aibv = fib->fmt0.aibv; + mutex_unlock(&aift->aift_lock); + + /* Issue the clp to setup the irq now */ + rc = kvm_zpci_set_airq(zdev); + return rc; + +unlock: + mutex_unlock(&aift->aift_lock); +unpin2: + if (fib->fmt0.sum == 1) + unpin_user_page(aisb_page); +unpin1: + unpin_user_page(aibv_page); +out: + return rc; +} + +static int kvm_s390_pci_aif_disable(struct zpci_dev *zdev, bool force) +{ + struct kvm_zdev *kzdev = zdev->kzdev; + struct zpci_gaite *gaite; + struct page *vpage = NULL, *spage = NULL; + int rc, pcount = 0; + u8 isc; + + if (zdev->gisa == 0) + return -EINVAL; + + mutex_lock(&aift->aift_lock); + + /* + * If the clear fails due to an error, leave now unless we know this + * device is about to go away (force) -- In that case clear the GAITE + * regardless. + */ + rc = kvm_zpci_clear_airq(zdev); + if (rc && !force) + goto out; + + if (zdev->kzdev->fib.fmt0.aibv == 0) + goto out; + spin_lock_irq(&aift->gait_lock); + gaite = (struct zpci_gaite *)aift->gait + (zdev->aisb * + sizeof(struct zpci_gaite)); + isc = gaite->gisc; + gaite->count--; + if (gaite->count == 0) { + /* Release guest AIBV and AISB */ + vpage = phys_to_page(kzdev->fib.fmt0.aibv); + if (gaite->aisb != 0) + spage = phys_to_page(gaite->aisb); + /* Clear the GAIT entry */ + gaite->aisb = 0; + gaite->gisc = 0; + gaite->aisbo = 0; + gaite->gisa = 0; + aift->kzdev[zdev->aisb] = 0; + /* Clear zdev info */ + airq_iv_free_bit(aift->sbv, zdev->aisb); + airq_iv_release(zdev->aibv); + zdev->aisb = 0; + zdev->aibv = NULL; + } + spin_unlock_irq(&aift->gait_lock); + kvm_s390_gisc_unregister(kzdev->kvm, isc); + kzdev->fib.fmt0.isc = 0; + kzdev->fib.fmt0.aibv = 0; + + if (vpage) { + unpin_user_page(vpage); + pcount++; + } + if (spage) { + unpin_user_page(spage); + pcount++; + } + if (pcount > 0) + unaccount_mem(pcount); +out: + mutex_unlock(&aift->aift_lock); + + return rc; +} + +static int kvm_s390_pci_dev_open(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = kzalloc(sizeof(struct kvm_zdev), GFP_KERNEL); + if (!kzdev) + return -ENOMEM; + + kzdev->zdev = zdev; + zdev->kzdev = kzdev; + + return 0; +} + +static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) +{ + struct kvm_zdev *kzdev; + + kzdev = zdev->kzdev; + WARN_ON(kzdev->zdev != zdev); + zdev->kzdev = NULL; + kfree(kzdev); +} + + +/* + * Register device with the specified KVM. If interpetation facilities are + * available, enable them and let userspace indicate whether or not they will + * be used (specify SHM bit to disable). + */ +int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm) +{ + int rc; + + if (!zdev) + return -EINVAL; + + mutex_lock(&zdev->kzdev_lock); + + if (zdev->kzdev || zdev->gisa != 0 || !kvm) { + mutex_unlock(&zdev->kzdev_lock); + return -EINVAL; + } + + kvm_get_kvm(kvm); + + mutex_lock(&kvm->lock); + + rc = kvm_s390_pci_dev_open(zdev); + if (rc) + goto err; + + /* + * If interpretation facilities aren't available, add the device to + * the kzdev list but don't enable for interpretation. + */ + if (!kvm_s390_pci_interp_allowed()) + goto out; + + /* + * If this is the first request to use an interpreted device, make the + * necessary vcpu changes + */ + if (!kvm->arch.use_zpci_interp) + kvm_s390_vcpu_pci_enable_interp(kvm); + + if (zdev_enabled(zdev)) { + rc = zpci_disable_device(zdev); + if (rc) + goto err; + } + + /* + * Store information about the identity of the kvm guest allowed to + * access this device via interpretation to be used by host CLP + */ + zdev->gisa = (u32)virt_to_phys(&kvm->arch.sie_page2->gisa); + + rc = zpci_enable_device(zdev); + if (rc) + goto clear_gisa; + + /* Re-register the IOMMU that was already created */ + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table)); + if (rc) + goto clear_gisa; + +out: + zdev->kzdev->kvm = kvm; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_add_tail(&zdev->kzdev->entry, &kvm->arch.kzdev_list); + spin_unlock(&kvm->arch.kzdev_list_lock); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return 0; + +clear_gisa: + zdev->gisa = 0; +err: + if (zdev->kzdev) + kvm_s390_pci_dev_release(zdev); + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + kvm_put_kvm(kvm); + return rc; +} +EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm); + +void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev) +{ + struct kvm *kvm; + + if (!zdev) + return; + + mutex_lock(&zdev->kzdev_lock); + + if (WARN_ON(!zdev->kzdev)) { + mutex_unlock(&zdev->kzdev_lock); + return; + } + + kvm = zdev->kzdev->kvm; + mutex_lock(&kvm->lock); + + /* + * A 0 gisa means interpretation was never enabled, just remove the + * device from the list. + */ + if (zdev->gisa == 0) + goto out; + + /* Forwarding must be turned off before interpretation */ + if (zdev->kzdev->fib.fmt0.aibv != 0) + kvm_s390_pci_aif_disable(zdev, true); + + /* Remove the host CLP guest designation */ + zdev->gisa = 0; + + if (zdev_enabled(zdev)) { + if (zpci_disable_device(zdev)) + goto out; + } + + if (zpci_enable_device(zdev)) + goto out; + + /* Re-register the IOMMU that was already created */ + zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + virt_to_phys(zdev->dma_table)); + +out: + spin_lock(&kvm->arch.kzdev_list_lock); + list_del(&zdev->kzdev->entry); + spin_unlock(&kvm->arch.kzdev_list_lock); + kvm_s390_pci_dev_release(zdev); + + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + + kvm_put_kvm(kvm); +} +EXPORT_SYMBOL_GPL(kvm_s390_pci_unregister_kvm); + +void kvm_s390_pci_init_list(struct kvm *kvm) +{ + spin_lock_init(&kvm->arch.kzdev_list_lock); + INIT_LIST_HEAD(&kvm->arch.kzdev_list); +} + +void kvm_s390_pci_clear_list(struct kvm *kvm) +{ + /* + * This list should already be empty, either via vfio device closures + * or kvm fd cleanup. + */ + spin_lock(&kvm->arch.kzdev_list_lock); + WARN_ON_ONCE(!list_empty(&kvm->arch.kzdev_list)); + spin_unlock(&kvm->arch.kzdev_list_lock); +} + +static struct zpci_dev *get_zdev_from_kvm_by_fh(struct kvm *kvm, u32 fh) +{ + struct zpci_dev *zdev = NULL; + struct kvm_zdev *kzdev; + + spin_lock(&kvm->arch.kzdev_list_lock); + list_for_each_entry(kzdev, &kvm->arch.kzdev_list, entry) { + if (kzdev->zdev->fh == fh) { + zdev = kzdev->zdev; + break; + } + } + spin_unlock(&kvm->arch.kzdev_list_lock); + + return zdev; +} + +static int kvm_s390_pci_zpci_reg_aen(struct zpci_dev *zdev, + struct kvm_s390_zpci_op *args) +{ + struct zpci_fib fib = {}; + bool hostflag; + + fib.fmt0.aibv = args->u.reg_aen.ibv; + fib.fmt0.isc = args->u.reg_aen.isc; + fib.fmt0.noi = args->u.reg_aen.noi; + if (args->u.reg_aen.sb != 0) { + fib.fmt0.aisb = args->u.reg_aen.sb; + fib.fmt0.aisbo = args->u.reg_aen.sbo; + fib.fmt0.sum = 1; + } else { + fib.fmt0.aisb = 0; + fib.fmt0.aisbo = 0; + fib.fmt0.sum = 0; + } + + hostflag = !(args->u.reg_aen.flags & KVM_S390_ZPCIOP_REGAEN_HOST); + return kvm_s390_pci_aif_enable(zdev, &fib, hostflag); +} + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args) +{ + struct kvm_zdev *kzdev; + struct zpci_dev *zdev; + int r; + + zdev = get_zdev_from_kvm_by_fh(kvm, args->fh); + if (!zdev) + return -ENODEV; + + mutex_lock(&zdev->kzdev_lock); + mutex_lock(&kvm->lock); + + kzdev = zdev->kzdev; + if (!kzdev) { + r = -ENODEV; + goto out; + } + if (kzdev->kvm != kvm) { + r = -EPERM; + goto out; + } + + switch (args->op) { + case KVM_S390_ZPCIOP_REG_AEN: + /* Fail on unknown flags */ + if (args->u.reg_aen.flags & ~KVM_S390_ZPCIOP_REGAEN_HOST) { + r = -EINVAL; + break; + } + r = kvm_s390_pci_zpci_reg_aen(zdev, args); + break; + case KVM_S390_ZPCIOP_DEREG_AEN: + r = kvm_s390_pci_aif_disable(zdev, false); + break; + default: + r = -EINVAL; + } + +out: + mutex_unlock(&kvm->lock); + mutex_unlock(&zdev->kzdev_lock); + return r; +} + +int kvm_s390_pci_init(void) +{ + aift = kzalloc(sizeof(struct zpci_aift), GFP_KERNEL); + if (!aift) + return -ENOMEM; + + spin_lock_init(&aift->gait_lock); + mutex_init(&aift->aift_lock); + + return 0; +} + +void kvm_s390_pci_exit(void) +{ + mutex_destroy(&aift->aift_lock); + + kfree(aift); +} diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h new file mode 100644 index 000000000000..3a3606c3a0fe --- /dev/null +++ b/arch/s390/kvm/pci.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * s390 kvm PCI passthrough support + * + * Copyright IBM Corp. 2022 + * + * Author(s): Matthew Rosato <mjrosato@linux.ibm.com> + */ + +#ifndef __KVM_S390_PCI_H +#define __KVM_S390_PCI_H + +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <asm/airq.h> +#include <asm/cpu.h> + +struct kvm_zdev { + struct zpci_dev *zdev; + struct kvm *kvm; + struct zpci_fib fib; + struct list_head entry; +}; + +struct zpci_gaite { + u32 gisa; + u8 gisc; + u8 count; + u8 reserved; + u8 aisbo; + u64 aisb; +}; + +struct zpci_aift { + struct zpci_gaite *gait; + struct airq_iv *sbv; + struct kvm_zdev **kzdev; + spinlock_t gait_lock; /* Protects the gait, used during AEN forward */ + struct mutex aift_lock; /* Protects the other structures in aift */ +}; + +extern struct zpci_aift *aift; + +static inline struct kvm *kvm_s390_pci_si_to_kvm(struct zpci_aift *aift, + unsigned long si) +{ + if (!IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) || aift->kzdev == 0 || + aift->kzdev[si] == 0) + return 0; + return aift->kzdev[si]->kvm; +}; + +int kvm_s390_pci_aen_init(u8 nisc); +void kvm_s390_pci_aen_exit(void); + +void kvm_s390_pci_init_list(struct kvm *kvm); +void kvm_s390_pci_clear_list(struct kvm *kvm); + +int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args); + +int kvm_s390_pci_init(void); +void kvm_s390_pci_exit(void); + +static inline bool kvm_s390_pci_interp_allowed(void) +{ + struct cpuid cpu_id; + + get_cpu_id(&cpu_id); + switch (cpu_id.machine) { + case 0x2817: + case 0x2818: + case 0x2827: + case 0x2828: + case 0x2964: + case 0x2965: + /* No SHM on certain machines */ + return false; + default: + return (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM) && + sclp.has_zpci_lsi && sclp.has_aeni && sclp.has_aisi && + sclp.has_aisii); + } +} + +#endif /* __KVM_S390_PCI_H */ diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 83bb5cf97282..3335fa09b6f1 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -442,7 +442,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu) vcpu->stat.instruction_ipte_interlock++; if (psw_bits(vcpu->arch.sie_block->gpsw).pstate) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu)); + wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu->kvm)); kvm_s390_retry_instr(vcpu); VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation"); return 0; @@ -873,10 +873,18 @@ static int handle_stsi(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - if (fc > 3) { - kvm_s390_set_psw_cc(vcpu, 3); - return 0; - } + /* Bailout forbidden function codes */ + if (fc > 3 && fc != 15) + goto out_no_data; + + /* + * fc 15 is provided only with + * - PTF/CPU topology support through facility 15 + * - KVM_CAP_S390_USER_STSI + */ + if (fc == 15 && (!test_kvm_facility(vcpu->kvm, 11) || + !vcpu->kvm->arch.user_stsi)) + goto out_no_data; if (vcpu->run->s.regs.gprs[0] & 0x0fffff00 || vcpu->run->s.regs.gprs[1] & 0xffff0000) @@ -910,6 +918,10 @@ static int handle_stsi(struct kvm_vcpu *vcpu) goto out_no_data; handle_stsi_3_2_2(vcpu, (void *) mem); break; + case 15: /* fc 15 is fully handled in userspace */ + insert_stsi_usr_data(vcpu, operand2, ar, fc, sel1, sel2); + trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2); + return -EREMOTE; } if (kvm_s390_pv_cpu_is_protected(vcpu)) { memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, @@ -1471,7 +1483,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) access_key = (operand2 & 0xf0) >> 4; if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_lock(vcpu); + ipte_lock(vcpu->kvm); ret = guest_translate_address_with_key(vcpu, address, ar, &gpa, GACC_STORE, access_key); @@ -1508,7 +1520,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu) } if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT) - ipte_unlock(vcpu); + ipte_unlock(vcpu->kvm); return ret; } diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c index b4a499b10b67..7cb7799a0acb 100644 --- a/arch/s390/kvm/pv.c +++ b/arch/s390/kvm/pv.c @@ -13,8 +13,19 @@ #include <asm/gmap.h> #include <asm/uv.h> #include <asm/mman.h> +#include <linux/pagewalk.h> +#include <linux/sched/mm.h> +#include <linux/mmu_notifier.h> #include "kvm-s390.h" +static void kvm_s390_clear_pv_state(struct kvm *kvm) +{ + kvm->arch.pv.handle = 0; + kvm->arch.pv.guest_len = 0; + kvm->arch.pv.stor_base = 0; + kvm->arch.pv.stor_var = NULL; +} + int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) { int cc; @@ -109,7 +120,7 @@ static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) vfree(kvm->arch.pv.stor_var); free_pages(kvm->arch.pv.stor_base, get_order(uv_info.guest_base_stor_len)); - memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); + kvm_s390_clear_pv_state(kvm); } static int kvm_s390_pv_alloc_vm(struct kvm *kvm) @@ -153,21 +164,51 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) { int cc; - /* make all pages accessible before destroying the guest */ - s390_reset_acc(kvm->mm); - cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), UVC_CMD_DESTROY_SEC_CONF, rc, rrc); WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); - atomic_set(&kvm->mm->context.is_protected, 0); + /* + * if the mm still has a mapping, make all its pages accessible + * before destroying the guest + */ + if (mmget_not_zero(kvm->mm)) { + s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); + mmput(kvm->mm); + } + + if (!cc) { + atomic_dec(&kvm->mm->context.protected_count); + kvm_s390_pv_dealloc_vm(kvm); + } else { + /* Intended memory leak on "impossible" error */ + s390_replace_asce(kvm->arch.gmap); + } KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); - /* Inteded memory leak on "impossible" error */ - if (!cc) - kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; } +static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct kvm *kvm = container_of(subscription, struct kvm, arch.pv.mmu_notifier); + u16 dummy; + + /* + * No locking is needed since this is the last thread of the last user of this + * struct mm. + * When the struct kvm gets deinitialized, this notifier is also + * unregistered. This means that if this notifier runs, then the + * struct kvm is still valid. + */ + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); +} + +static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { + .release = kvm_s390_pv_mmu_notifier_release, +}; + int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) { struct uv_cb_cgc uvcb = { @@ -198,14 +239,22 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) /* Outputs */ kvm->arch.pv.handle = uvcb.guest_handle; + atomic_inc(&kvm->mm->context.protected_count); if (cc) { - if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) { kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); - else + } else { + atomic_dec(&kvm->mm->context.protected_count); kvm_s390_pv_dealloc_vm(kvm); + } return -EIO; } kvm->arch.gmap->guest_handle = uvcb.guest_handle; + /* Add the notifier only once. No races because we hold kvm->lock */ + if (kvm->arch.pv.mmu_notifier.ops != &kvm_s390_pv_mmu_notifier_ops) { + kvm->arch.pv.mmu_notifier.ops = &kvm_s390_pv_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.pv.mmu_notifier, kvm->mm); + } return 0; } @@ -225,8 +274,6 @@ int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, *rrc = uvcb.header.rrc; KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", *rc, *rrc); - if (!cc) - atomic_set(&kvm->mm->context.is_protected, 1); return cc ? -EINVAL : 0; } diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 8aaee2892ec3..cb747bf6c798 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -480,9 +480,9 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu) struct kvm_vcpu *dest_vcpu; u8 order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); - if (order_code == SIGP_EXTERNAL_CALL) { + trace_kvm_s390_handle_sigp_pei(vcpu, order_code, cpu_addr); + dest_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, cpu_addr); BUG_ON(dest_vcpu == NULL); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index dada78b92691..94138f8f0c1c 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -503,6 +503,14 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; + /* + * CPU Topology + * This facility only uses the utility field of the SCA and none of + * the cpu entries that are problematic with the other interpretation + * facilities so we can pass it through + */ + if (test_kvm_facility(vcpu->kvm, 11)) + scb_s->ecb |= scb_o->ecb & ECB_PTF; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73) && wants_tx) { /* remap the prefix is tx is toggled on */ diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e173b6187ad5..ee7871f770fb 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -754,6 +754,7 @@ void do_secure_storage_access(struct pt_regs *regs) struct vm_area_struct *vma; struct mm_struct *mm; struct page *page; + struct gmap *gmap; int rc; /* @@ -783,6 +784,17 @@ void do_secure_storage_access(struct pt_regs *regs) } switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) { + do_fault_error(regs, VM_ACCESS_FLAGS, VM_FAULT_BADMAP); + break; + } + fallthrough; case USER_FAULT: mm = current->mm; mmap_read_lock(mm); @@ -811,7 +823,6 @@ void do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); break; - case GMAP_FAULT: default: do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); WARN_ON_ONCE(1); @@ -837,6 +848,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access); void do_secure_storage_violation(struct pt_regs *regs) { + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; /* * Either KVM messed up the secure guest mapping or the same * page is mapped into multiple secure guests. diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b8ae4a4aa2ba..62758cb5872f 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2697,41 +2697,168 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. */ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index bc980fd313d5..73cdc5539384 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -61,6 +61,12 @@ DEFINE_STATIC_KEY_FALSE(have_mio); static struct kmem_cache *zdev_fmb_cache; +/* AEN structures that must be preserved over KVM module re-insertion */ +union zpci_sic_iib *zpci_aipb; +EXPORT_SYMBOL_GPL(zpci_aipb); +struct airq_iv *zpci_aif_sbv; +EXPORT_SYMBOL_GPL(zpci_aif_sbv); + struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -120,11 +126,13 @@ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, fib.pba = base; fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); return cc; } +EXPORT_SYMBOL_GPL(zpci_register_ioat); /* Modify PCI: Unregister I/O address translation parameters */ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) @@ -133,6 +141,8 @@ int zpci_unregister_ioat(struct zpci_dev *zdev, u8 dmaas) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc) zpci_dbg(3, "unreg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); @@ -160,6 +170,7 @@ int zpci_fmb_enable_device(struct zpci_dev *zdev) atomic64_set(&zdev->unmapped_pages, 0); fib.fmb_addr = virt_to_phys(zdev->fmb); + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc) { kmem_cache_free(zdev_fmb_cache, zdev->fmb); @@ -178,6 +189,8 @@ int zpci_fmb_disable_device(struct zpci_dev *zdev) if (!zdev->fmb) return -EINVAL; + fib.gd = zdev->gisa; + /* Function measurement is disabled if fmb address is zero */ cc = zpci_mod_fc(req, &fib, &status); if (cc == 3) /* Function already gone. */ @@ -700,6 +713,7 @@ int zpci_enable_device(struct zpci_dev *zdev) zpci_update_fh(zdev, fh); return rc; } +EXPORT_SYMBOL_GPL(zpci_enable_device); int zpci_disable_device(struct zpci_dev *zdev) { @@ -723,6 +737,7 @@ int zpci_disable_device(struct zpci_dev *zdev) } return rc; } +EXPORT_SYMBOL_GPL(zpci_disable_device); /** * zpci_hot_reset_device - perform a reset of the given zPCI function @@ -816,6 +831,7 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) kref_init(&zdev->kref); mutex_init(&zdev->lock); + mutex_init(&zdev->kzdev_lock); rc = zpci_init_iommu(zdev); if (rc) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 375e0a5120bc..ee367798e388 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -106,6 +106,8 @@ static void clp_store_query_pci_fngrp(struct zpci_dev *zdev, zdev->max_msi = response->noi; zdev->fmb_update = response->mui; zdev->version = response->version; + zdev->maxstbl = response->maxstbl; + zdev->dtsm = response->dtsm; switch (response->version) { case 1: @@ -229,12 +231,16 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma { struct clp_req_rsp_set_pci *rrb; int rc, retries = 100; + u32 gisa = 0; *fh = 0; rrb = clp_alloc_block(GFP_KERNEL); if (!rrb) return -ENOMEM; + if (command != CLP_SET_DISABLE_PCI_FN) + gisa = zdev->gisa; + do { memset(rrb, 0, sizeof(*rrb)); rrb->request.hdr.len = sizeof(rrb->request); @@ -243,6 +249,7 @@ static int clp_set_pci_fn(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as, u8 comma rrb->request.fh = zdev->fh; rrb->request.oc = command; rrb->request.ndas = nr_dma_as; + rrb->request.gisa = gisa; rc = clp_req(rrb, CLP_LPS_PCI); if (rrb->response.hdr.rsp == CLP_RC_SETPCIFN_BUSY) { diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 1a822b7799f8..56480be48244 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -92,6 +92,7 @@ u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status) return cc; } +EXPORT_SYMBOL_GPL(zpci_mod_fc); /* Refresh PCI Translations */ static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status) @@ -138,7 +139,7 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) +int zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; @@ -149,6 +150,7 @@ int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) return 0; } +EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl); /* PCI Load */ static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 500cd2dbdf53..a2b42a63a53b 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -11,16 +11,10 @@ #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> static enum {FLOATING, DIRECTED} irq_delivery; -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 -#define SIC_IRQ_MODE_DIRECT 4 -#define SIC_IRQ_MODE_D_ALL 16 -#define SIC_IRQ_MODE_D_SINGLE 17 -#define SIC_IRQ_MODE_SET_CPU 18 - /* * summary bit vector * FLOATING - summary bit per function @@ -49,6 +43,7 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ fib.fmt0.aisb = virt_to_phys(zpci_sbv->vector) + (zdev->aisb / 64) * 8; fib.fmt0.aisbo = zdev->aisb & 63; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -60,6 +55,8 @@ static int zpci_clear_airq(struct zpci_dev *zdev) struct zpci_fib fib = {0}; u8 cc, status; + fib.gd = zdev->gisa; + cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -78,6 +75,7 @@ static int zpci_set_directed_irq(struct zpci_dev *zdev) fib.fmt = 1; fib.fmt1.noi = zdev->msi_nr_irqs; fib.fmt1.dibvo = zdev->msi_first_bit; + fib.gd = zdev->gisa; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -90,6 +88,7 @@ static int zpci_clear_directed_irq(struct zpci_dev *zdev) u8 cc, status; fib.fmt = 1; + fib.gd = zdev->gisa; cc = zpci_mod_fc(req, &fib, &status); if (cc == 3 || (cc == 1 && status == 24)) /* Function already gone or IRQs already deregistered. */ @@ -153,6 +152,7 @@ static struct irq_chip zpci_irq_chip = { static void zpci_handle_cpu_local_irq(bool rescan) { struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + union zpci_sic_iib iib = {{0}}; unsigned long bit; int irqs_on = 0; @@ -164,7 +164,7 @@ static void zpci_handle_cpu_local_irq(bool rescan) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &iib)) break; bit = 0; continue; @@ -192,6 +192,7 @@ static void zpci_handle_remote_irq(void *data) static void zpci_handle_fallback_irq(void) { struct cpu_irq_data *cpu_data; + union zpci_sic_iib iib = {{0}}; unsigned long cpu; int irqs_on = 0; @@ -202,7 +203,7 @@ static void zpci_handle_fallback_irq(void) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; cpu = 0; continue; @@ -216,8 +217,11 @@ static void zpci_handle_fallback_irq(void) } } -static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_directed_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + bool floating = !tpi_info->directed_irq; + if (floating) { inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); @@ -227,8 +231,10 @@ static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) } } -static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_floating_irq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { + union zpci_sic_iib iib = {{0}}; unsigned long si, ai; struct airq_iv *aibv; int irqs_on = 0; @@ -242,7 +248,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib)) break; si = 0; continue; @@ -291,7 +297,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) zdev->aisb = bit; /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK, NULL); if (!zdev->aibv) return -ENOMEM; @@ -402,11 +408,12 @@ static struct airq_struct zpci_airq = { static void __init cpu_enable_directed_irq(void *unused) { union zpci_sic_iib iib = {{0}}; + union zpci_sic_iib ziib = {{0}}; iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; - __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); - zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC, &ziib); } static int __init zpci_directed_irq_init(void) @@ -414,14 +421,14 @@ static int __init zpci_directed_irq_init(void) union zpci_sic_iib iib = {{0}}; unsigned int cpu; - zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + zpci_sbv = airq_iv_create(num_possible_cpus(), 0, NULL); if (!zpci_sbv) return -ENOMEM; iib.diib.isc = PCI_ISC; iib.diib.nr_cpus = num_possible_cpus(); iib.diib.disb_addr = virt_to_phys(zpci_sbv->vector); - __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), GFP_KERNEL); @@ -436,7 +443,7 @@ static int __init zpci_directed_irq_init(void) zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, AIRQ_IV_DATA | AIRQ_IV_CACHELINE | - (!cpu ? AIRQ_IV_ALLOC : 0)); + (!cpu ? AIRQ_IV_ALLOC : 0), NULL); if (!zpci_ibv[cpu]) return -ENOMEM; } @@ -453,7 +460,7 @@ static int __init zpci_floating_irq_init(void) if (!zpci_ibv) return -ENOMEM; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC, NULL); if (!zpci_sbv) goto out_free; @@ -466,6 +473,7 @@ out_free: int __init zpci_irq_init(void) { + union zpci_sic_iib iib = {{0}}; int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; @@ -497,7 +505,7 @@ int __init zpci_irq_init(void) * Enable floating IRQs (with suppression after one IRQ). When using * directed IRQs this enables the fallback path. */ - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC, &iib); return 0; out_airq: diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 530dd941d140..cb0aff5c0187 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -111,6 +111,7 @@ static struct facility_def facility_defs[] = { 193, /* bear enhancement facility */ 194, /* rdp enhancement facility */ 196, /* processor activity instrumentation facility */ + 197, /* processor activity instrumentation extension 1 */ -1 /* END */ } }, diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index dd313ff57df3..d15b0d541de3 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -45,6 +45,10 @@ static void __init sclp_early_facilities_detect(void) sclp.has_gisaf = !!(sccb->fac118 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); sclp.has_kss = !!(sccb->fac98 & 0x01); + sclp.has_aisii = !!(sccb->fac118 & 0x40); + sclp.has_aeni = !!(sccb->fac118 & 0x20); + sclp.has_aisi = !!(sccb->fac118 & 0x10); + sclp.has_zpci_lsi = !!(sccb->fac118 & 0x01); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; if (sccb->fac91 & 0x40) diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index c0ed364bf446..34967e67249e 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -99,7 +99,7 @@ static irqreturn_t do_airq_interrupt(int irq, void *dummy) rcu_read_lock(); hlist_for_each_entry_rcu(airq, head, list) if ((*airq->lsi_ptr & airq->lsi_mask) != 0) - airq->handler(airq, !tpi_info->directed_irq); + airq->handler(airq, tpi_info); rcu_read_unlock(); return IRQ_HANDLED; @@ -122,10 +122,12 @@ static inline unsigned long iv_size(unsigned long bits) * airq_iv_create - create an interrupt vector * @bits: number of bits in the interrupt vector * @flags: allocation flags + * @vec: pointer to pinned guest memory if AIRQ_IV_GUESTVEC * * Returns a pointer to an interrupt vector structure */ -struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags) +struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags, + unsigned long *vec) { struct airq_iv *iv; unsigned long size; @@ -146,6 +148,8 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags) &iv->vector_dma); if (!iv->vector) goto out_free; + } else if (flags & AIRQ_IV_GUESTVEC) { + iv->vector = vec; } else { iv->vector = cio_dma_zalloc(size); if (!iv->vector) @@ -185,7 +189,7 @@ out_free: kfree(iv->avail); if (iv->flags & AIRQ_IV_CACHELINE && iv->vector) dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma); - else + else if (!(iv->flags & AIRQ_IV_GUESTVEC)) cio_dma_free(iv->vector, size); kfree(iv); out: @@ -204,7 +208,7 @@ void airq_iv_release(struct airq_iv *iv) kfree(iv->bitlock); if (iv->flags & AIRQ_IV_CACHELINE) dma_pool_free(airq_iv_cache, iv->vector, iv->vector_dma); - else + else if (!(iv->flags & AIRQ_IV_GUESTVEC)) cio_dma_free(iv->vector, iv_size(iv->bits)); kfree(iv->avail); kfree(iv); diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 8e09bf3a2fcd..9b9335dd06db 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -15,6 +15,7 @@ #include <asm/qdio.h> #include <asm/airq.h> #include <asm/isc.h> +#include <asm/tpi.h> #include "cio.h" #include "ioasm.h" @@ -93,9 +94,10 @@ static inline u32 clear_shared_ind(void) /** * tiqdio_thinint_handler - thin interrupt handler for qdio * @airq: pointer to adapter interrupt descriptor - * @floating: flag to recognize floating vs. directed interrupts (unused) + * @tpi_info: interrupt information (e.g. floating vs directed -- unused) */ -static void tiqdio_thinint_handler(struct airq_struct *airq, bool floating) +static void tiqdio_thinint_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { u64 irq_time = S390_lowcore.int_clock; u32 si_used = clear_shared_ind(); diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 5c13d2079d96..1b85f21c151b 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -27,6 +27,7 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <asm/airq.h> +#include <asm/tpi.h> #include <linux/atomic.h> #include <asm/isc.h> #include <linux/hrtimer.h> @@ -131,7 +132,8 @@ static int ap_max_adapter_id = 63; static struct bus_type ap_bus_type; /* Adapter interrupt definitions */ -static void ap_interrupt_handler(struct airq_struct *airq, bool floating); +static void ap_interrupt_handler(struct airq_struct *airq, + struct tpi_info *tpi_info); static bool ap_irq_flag; @@ -452,9 +454,10 @@ static enum hrtimer_restart ap_poll_timeout(struct hrtimer *unused) /** * ap_interrupt_handler() - Schedule ap_tasklet on interrupt * @airq: pointer to adapter interrupt descriptor - * @floating: ignored + * @tpi_info: ignored */ -static void ap_interrupt_handler(struct airq_struct *airq, bool floating) +static void ap_interrupt_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { inc_irq_stat(IRQIO_APB); tasklet_schedule(&ap_tasklet); diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 97e51c34e6cf..51bef226d33f 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -33,6 +33,7 @@ #include <asm/virtio-ccw.h> #include <asm/isc.h> #include <asm/airq.h> +#include <asm/tpi.h> /* * virtio related functions @@ -204,7 +205,8 @@ static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info) write_unlock_irqrestore(&info->lock, flags); } -static void virtio_airq_handler(struct airq_struct *airq, bool floating) +static void virtio_airq_handler(struct airq_struct *airq, + struct tpi_info *tpi_info) { struct airq_info *info = container_of(airq, struct airq_info, airq); unsigned long ai; @@ -240,7 +242,7 @@ static struct airq_info *new_airq_info(int index) return NULL; rwlock_init(&info->lock); info->aiv = airq_iv_create(VIRTIO_IV_BITS, AIRQ_IV_ALLOC | AIRQ_IV_PTR - | AIRQ_IV_CACHELINE); + | AIRQ_IV_CACHELINE, NULL); if (!info->aiv) { kfree(info); return NULL; diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 4da1914425e1..f9d0c908e738 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -44,6 +44,17 @@ config VFIO_PCI_IGD To enable Intel IGD assignment through vfio-pci, say Y. endif +config VFIO_PCI_ZDEV_KVM + bool "VFIO PCI extensions for s390x KVM passthrough" + depends on S390 && KVM + default y + help + Support s390x-specific extensions to enable support for enhancements + to KVM passthrough capabilities, such as interpretive execution of + zPCI instructions. + + To enable s390x KVM vfio-pci extensions, say Y. + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 7052ebd893e0..24c524224da5 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o -vfio-pci-core-$(CONFIG_S390) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index a0d69ddaf90d..b1e5cfbadf38 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -316,10 +316,14 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) pci_write_config_word(pdev, PCI_COMMAND, cmd); } - ret = vfio_config_init(vdev); + ret = vfio_pci_zdev_open_device(vdev); if (ret) goto out_free_state; + ret = vfio_config_init(vdev); + if (ret) + goto out_free_zdev; + msix_pos = pdev->msix_cap; if (msix_pos) { u16 flags; @@ -340,6 +344,8 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) return 0; +out_free_zdev: + vfio_pci_zdev_close_device(vdev); out_free_state: kfree(vdev->pci_saved_state); vdev->pci_saved_state = NULL; @@ -418,6 +424,8 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) vdev->needs_reset = true; + vfio_pci_zdev_close_device(vdev); + /* * If we have saved state, restore it. If we can reset the device, * even better. Resetting with current state seems better than diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index ea4c0d2b0663..e163aa9f6144 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -11,6 +11,7 @@ #include <linux/uaccess.h> #include <linux/vfio.h> #include <linux/vfio_zdev.h> +#include <linux/kvm_host.h> #include <asm/pci_clp.h> #include <asm/pci_io.h> @@ -23,14 +24,15 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_base cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE, - .header.version = 1, + .header.version = 2, .start_dma = zdev->start_dma, .end_dma = zdev->end_dma, .pchid = zdev->pchid, .vfn = zdev->vfn, .fmb_length = zdev->fmb_length, .pft = zdev->pft, - .gid = zdev->pfgid + .gid = zdev->pfgid, + .fh = zdev->fh }; return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); @@ -43,14 +45,16 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps) { struct vfio_device_info_cap_zpci_group cap = { .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP, - .header.version = 1, + .header.version = 2, .dasm = zdev->dma_mask, .msi_addr = zdev->msi_addr, .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH, .mui = zdev->fmb_update, .noi = zdev->max_msi, .maxstbl = ZPCI_MAX_WRITE_SIZE, - .version = zdev->version + .version = zdev->version, + .reserved = 0, + .imaxstbl = zdev->maxstbl }; return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); @@ -136,3 +140,26 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, return ret; } + +int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + + if (!zdev) + return -ENODEV; + + if (!vdev->vdev.kvm) + return 0; + + return kvm_s390_pci_register_kvm(zdev, vdev->vdev.kvm); +} + +void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + + if (!zdev || !vdev->vdev.kvm) + return; + + kvm_s390_pci_unregister_kvm(zdev); +} diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 00ed419dd464..f054d0360a75 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -24,7 +24,8 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) || defined(CONFIG_IO_URING) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) || \ + defined(CONFIG_VFIO_PCI_ZDEV_KVM) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 23c176d4b073..d5d9e17f0156 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -206,15 +206,25 @@ static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) } #endif -#ifdef CONFIG_S390 +#ifdef CONFIG_VFIO_PCI_ZDEV_KVM extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps); +int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); +void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); #else static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, struct vfio_info_cap *caps) { return -ENODEV; } + +static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) +{ + return 0; +} + +static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) +{} #endif /* Will be exported for vfio pci drivers usage */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a36e78710382..7e06194129e3 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1167,6 +1167,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_TRIPLE_FAULT_EVENT 218 #define KVM_CAP_X86_NOTIFY_VMEXIT 219 #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 +#define KVM_CAP_S390_ZPCI_OP 221 +#define KVM_CAP_S390_CPU_TOPOLOGY 222 #ifdef KVM_CAP_IRQ_ROUTING @@ -2186,4 +2188,34 @@ struct kvm_stats_desc { #define KVM_X86_NOTIFY_VMEXIT_ENABLED (1ULL << 0) #define KVM_X86_NOTIFY_VMEXIT_USER (1ULL << 1) +/* Available with KVM_CAP_S390_ZPCI_OP */ +#define KVM_S390_ZPCI_OP _IOW(KVMIO, 0xd1, struct kvm_s390_zpci_op) + +struct kvm_s390_zpci_op { + /* in */ + __u32 fh; /* target device */ + __u8 op; /* operation to perform */ + __u8 pad[3]; + union { + /* for KVM_S390_ZPCIOP_REG_AEN */ + struct { + __u64 ibv; /* Guest addr of interrupt bit vector */ + __u64 sb; /* Guest addr of summary bit */ + __u32 flags; + __u32 noi; /* Number of interrupts */ + __u8 isc; /* Guest interrupt subclass */ + __u8 sbo; /* Offset of guest summary bit vector */ + __u16 pad; + } reg_aen; + __u64 reserved[8]; + } u; +}; + +/* types for kvm_s390_zpci_op->op */ +#define KVM_S390_ZPCIOP_REG_AEN 0 +#define KVM_S390_ZPCIOP_DEREG_AEN 1 + +/* flags for kvm_s390_zpci_op->u.reg_aen.flags */ +#define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) + #endif /* __LINUX_KVM_H */ diff --git a/include/uapi/linux/vfio_zdev.h b/include/uapi/linux/vfio_zdev.h index b4309397b6b2..77f2aff1f27e 100644 --- a/include/uapi/linux/vfio_zdev.h +++ b/include/uapi/linux/vfio_zdev.h @@ -29,6 +29,9 @@ struct vfio_device_info_cap_zpci_base { __u16 fmb_length; /* Measurement Block Length (in bytes) */ __u8 pft; /* PCI Function Type */ __u8 gid; /* PCI function group ID */ + /* End of version 1 */ + __u32 fh; /* PCI function handle */ + /* End of version 2 */ }; /** @@ -47,6 +50,10 @@ struct vfio_device_info_cap_zpci_group { __u16 noi; /* Maximum number of MSIs */ __u16 maxstbl; /* Maximum Store Block Length */ __u8 version; /* Supported PCI Version */ + /* End of version 1 */ + __u8 reserved; + __u16 imaxstbl; /* Maximum Interpreted Store Block Length */ + /* End of version 2 */ }; /** |