diff options
Diffstat (limited to 'kernel')
55 files changed, 1251 insertions, 442 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 8e09f0f55b4b..17b0d523afb3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -102,12 +102,13 @@ struct audit_net { * This struct is RCU protected; you must either hold the RCU lock for reading * or the associated spinlock for writing. */ -static struct auditd_connection { +struct auditd_connection { struct pid *pid; u32 portid; struct net *net; struct rcu_head rcu; -} *auditd_conn = NULL; +}; +static struct auditd_connection __rcu *auditd_conn; static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index ca52b9642943..d4f335a9a899 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -44,14 +44,19 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } -struct tnum tnum_arshift(struct tnum a, u8 min_shift) +struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ - return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); + if (insn_bitness == 32) + return TNUM((u32)(((s32)a.value) >> min_shift), + (u32)(((s32)a.mask) >> min_shift)); + else + return TNUM((s64)a.value >> min_shift, + (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ce85e7041f0c..7d530ce8719d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5049,9 +5049,16 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Upon reaching here, src_known is true and * umax_val is equal to umin_val. */ - dst_reg->smin_value >>= umin_val; - dst_reg->smax_value >>= umin_val; - dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + if (insn_bitness == 32) { + dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); + dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); + } else { + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + } + + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, + insn_bitness); /* blow away the dst_reg umin_value/umax_value and rely on * dst_reg var_off to refine the result. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 735af8f15f95..1e12e6928bca 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3055,8 +3055,6 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) continue; @@ -3066,6 +3064,8 @@ static int cgroup_apply_control_enable(struct cgroup *cgrp) return PTR_ERR(css); } + WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); + if (css_visible(css)) { ret = css_populate_dir(css); if (ret) @@ -3101,11 +3101,11 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp) for_each_subsys(ss, ssid) { struct cgroup_subsys_state *css = cgroup_css(dsct, ss); - WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); - if (!css) continue; + WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt)); + if (css->parent && !(cgroup_ss_mask(dsct) & (1 << ss->id))) { kill_css(css); @@ -3392,7 +3392,8 @@ static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf, if (strcmp(strstrip(buf), "threaded")) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn, false); + /* drain dying csses before we re-apply (threaded) subtree control */ + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENOENT; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b48b22d4deb6..6f87352f8219 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,7 +33,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we * see NULL updated_next or they see our updated stat. */ smp_mb(); diff --git a/kernel/cpu.c b/kernel/cpu.c index a59cc980adad..4dc279ed3b2d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1909,6 +1909,78 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) } EXPORT_SYMBOL(__cpuhp_remove_state); +#ifdef CONFIG_HOTPLUG_SMT +static void cpuhp_offline_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = true; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); +} + +static void cpuhp_online_cpu_device(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); + + dev->offline = false; + /* Tell user space about the state change */ + kobject_uevent(&dev->kobj, KOBJ_ONLINE); +} + +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; + /* + * As this needs to hold the cpu maps lock it's impossible + * to call device_offline() because that ends up calling + * cpu_down() which takes cpu maps lock. cpu maps lock + * needs to be held as this might race against in kernel + * abusers of the hotplug machinery (thermal management). + * + * So nothing would update device:offline state. That would + * leave the sysfs entry stale and prevent onlining after + * smt control has been changed to 'off' again. This is + * called under the sysfs hotplug lock, so it is properly + * serialized against the regular offline usage. + */ + cpuhp_offline_cpu_device(cpu); + } + if (!ret) + cpu_smt_control = ctrlval; + cpu_maps_update_done(); + return ret; +} + +int cpuhp_smt_enable(void) +{ + int cpu, ret = 0; + + cpu_maps_update_begin(); + cpu_smt_control = CPU_SMT_ENABLED; + for_each_present_cpu(cpu) { + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; + /* See comment in cpuhp_smt_disable() */ + cpuhp_online_cpu_device(cpu); + } + cpu_maps_update_done(); + return ret; +} +#endif + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) @@ -2063,77 +2135,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static void cpuhp_offline_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = true; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_OFFLINE); -} - -static void cpuhp_online_cpu_device(unsigned int cpu) -{ - struct device *dev = get_cpu_device(cpu); - - dev->offline = false; - /* Tell user space about the state change */ - kobject_uevent(&dev->kobj, KOBJ_ONLINE); -} - -int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - for_each_online_cpu(cpu) { - if (topology_is_primary_thread(cpu)) - continue; - ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); - if (ret) - break; - /* - * As this needs to hold the cpu maps lock it's impossible - * to call device_offline() because that ends up calling - * cpu_down() which takes cpu maps lock. cpu maps lock - * needs to be held as this might race against in kernel - * abusers of the hotplug machinery (thermal management). - * - * So nothing would update device:offline state. That would - * leave the sysfs entry stale and prevent onlining after - * smt control has been changed to 'off' again. This is - * called under the sysfs hotplug lock, so it is properly - * serialized against the regular offline usage. - */ - cpuhp_offline_cpu_device(cpu); - } - if (!ret) - cpu_smt_control = ctrlval; - cpu_maps_update_done(); - return ret; -} - -int cpuhp_smt_enable(void) -{ - int cpu, ret = 0; - - cpu_maps_update_begin(); - cpu_smt_control = CPU_SMT_ENABLED; - for_each_present_cpu(cpu) { - /* Skip online CPUs and CPUs on offline nodes */ - if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) - continue; - ret = _cpu_up(cpu, 0, CPUHP_ONLINE); - if (ret) - break; - /* See comment in cpuhp_smt_disable() */ - cpuhp_online_cpu_device(cpu); - } - cpu_maps_update_done(); - return ret; -} - - static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) diff --git a/kernel/cred.c b/kernel/cred.c index 9ed51b70ed80..809a985b1793 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -175,8 +175,8 @@ void exit_creds(struct task_struct *tsk) put_cred(cred); #ifdef CONFIG_KEYS_REQUEST_CACHE - key_put(current->cached_requested_key); - current->cached_requested_key = NULL; + key_put(tsk->cached_requested_key); + tsk->cached_requested_key = NULL; #endif } diff --git a/kernel/events/core.c b/kernel/events/core.c index a1f8bde19b56..2173c23c25b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11465,8 +11465,10 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) + if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { + err = -EINVAL; goto err_locked; + } /* * Must be under the same ctx::mutex as perf_install_in_context(), diff --git a/kernel/fork.c b/kernel/fork.c index 080809560072..ef82feb4bddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1832,6 +1832,7 @@ static __latent_entropy struct task_struct *copy_process( struct multiprocess_signals delayed; struct file *pidfile = NULL; u64 clone_flags = args->flags; + struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different @@ -1874,8 +1875,16 @@ static __latent_entropy struct task_struct *copy_process( */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + (task_active_pid_ns(current) != nsp->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } + + /* + * If the new process will be in a different time namespace + * do not allow it to share VM or a thread group with the forking task. + */ + if (clone_flags & (CLONE_THREAD | CLONE_VM)) { + if (nsp->time_ns != nsp->time_ns_for_children) return ERR_PTR(-EINVAL); } @@ -2821,7 +2830,8 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| + CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/futex.c b/kernel/futex.c index 03c518e9747e..0cf84c8664f2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1178,6 +1178,7 @@ out_error: /** * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status * @exiting: Pointer to the exiting task * * Caller must hold a refcount on @exiting. diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6c7ca2e983a5..02236b13b359 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -12,6 +12,7 @@ #include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/irq.h> +#include <linux/sched/isolation.h> #include "internals.h" @@ -171,6 +172,20 @@ void irq_migrate_all_off_this_cpu(void) } } +static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) +{ + const struct cpumask *hk_mask; + + if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + return false; + + hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) + return false; + + return cpumask_test_cpu(cpu, hk_mask); +} + static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) { struct irq_data *data = irq_desc_get_irq_data(desc); @@ -188,9 +203,11 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) /* * If the interrupt can only be directed to a single target * CPU then it is already assigned to a CPU in the affinity - * mask. No point in trying to move it around. + * mask. No point in trying to move it around unless the + * isolation mechanism requests to move it to an upcoming + * housekeeping CPU. */ - if (!irqd_is_single_target(data)) + if (!irqd_is_single_target(data) || hk_should_isolate(data, cpu)) irq_set_affinity_locked(data, affinity, false); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5b8fdd659e54..98a5f10d1900 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -891,6 +891,7 @@ __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) + __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dd822fd8a7d5..7527e5ef6fe5 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -987,6 +987,23 @@ const struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); /** + * irq_domain_translate_onecell() - Generic translate for direct one cell + * bindings + */ +int irq_domain_translate_onecell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 1)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = IRQ_TYPE_NONE; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_onecell); + +/** * irq_domain_translate_twocell() - Generic translate for direct two cell * bindings * @@ -1459,6 +1476,7 @@ int irq_domain_push_irq(struct irq_domain *domain, int virq, void *arg) if (rv) { /* Restore the original irq_data. */ *root_irq_data = *child_irq_data; + kfree(child_irq_data); goto error; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1753486b440c..818b2802d3e7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,7 @@ #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/sched/task.h> +#include <linux/sched/isolation.h> #include <uapi/linux/sched/types.h> #include <linux/task_work.h> @@ -217,7 +218,45 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - ret = chip->irq_set_affinity(data, mask, force); + /* + * If this is a managed interrupt and housekeeping is enabled on + * it check whether the requested affinity mask intersects with + * a housekeeping CPU. If so, then remove the isolated CPUs from + * the mask and just keep the housekeeping CPU(s). This prevents + * the affinity setter from routing the interrupt to an isolated + * CPU to avoid that I/O submitted from a housekeeping CPU causes + * interrupts on an isolated one. + * + * If the masks do not intersect or include online CPU(s) then + * keep the requested mask. The isolated target CPUs are only + * receiving interrupts when the I/O operation was submitted + * directly from them. + * + * If all housekeeping CPUs in the affinity mask are offline, the + * interrupt will be migrated by the CPU hotplug code once a + * housekeeping CPU which belongs to the affinity mask comes + * online. + */ + if (irqd_affinity_is_managed(data) && + housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { + const struct cpumask *hk_mask, *prog_mask; + + static DEFINE_RAW_SPINLOCK(tmp_mask_lock); + static struct cpumask tmp_mask; + + hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + + raw_spin_lock(&tmp_mask_lock); + cpumask_and(&tmp_mask, mask, hk_mask); + if (!cpumask_intersects(&tmp_mask, cpu_online_mask)) + prog_mask = mask; + else + prog_mask = &tmp_mask; + ret = chip->irq_set_affinity(data, prog_mask, force); + raw_spin_unlock(&tmp_mask_lock); + } else { + ret = chip->irq_set_affinity(data, mask, force); + } switch (ret) { case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: @@ -1500,8 +1539,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * has. The type flags are unreliable as the * underlying chip implementation can override them. */ - pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n", - irq); + pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n", + new->name, irq); ret = -EINVAL; goto out_unlock; } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2ed97a7c9b2a..f865e5f4d382 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -34,6 +34,7 @@ static atomic_t irq_poll_active; * true and let the handler run. */ bool irq_wait_for_poll(struct irq_desc *desc) + __must_hold(&desc->lock) { if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), "irq poll in progress on cpu %d for irq %d\n", diff --git a/kernel/kexec.c b/kernel/kexec.c index bc933c0db9bf..f977786fe498 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -159,6 +159,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, kimage_terminate(image); + ret = machine_kexec_post_load(image); + if (ret) + goto out; + /* Install the new kernel and uninstall the old */ image = xchg(dest_image, image); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 15d70a90b50d..c19c0dad1ebe 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -589,6 +589,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unusable_pages); } + +int __weak machine_kexec_post_load(struct kimage *image) +{ + return 0; +} + void kimage_terminate(struct kimage *image) { if (*image->entry != 0) @@ -1171,7 +1177,7 @@ int kernel_kexec(void) * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); - pr_emerg("Starting new kernel\n"); + pr_notice("Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index a2df93948665..faa74d5f6941 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -441,6 +441,10 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, kimage_terminate(image); + ret = machine_kexec_post_load(image); + if (ret) + goto out; + /* * Free up any temporary buffers allocated which are not needed * after image has been loaded diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d..39d30ccf8d87 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,6 +13,8 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); +int machine_kexec_post_load(struct kimage *image); + extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 32282e7112d3..32406ef0d6a2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -482,7 +482,7 @@ static struct lock_trace *save_trace(void) struct lock_trace *trace, *t2; struct hlist_head *hash_head; u32 hash; - unsigned int max_entries; + int max_entries; BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE); BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES); @@ -490,10 +490,8 @@ static struct lock_trace *save_trace(void) trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries); max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries - LOCK_TRACE_SIZE_IN_LONGS; - trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES - - LOCK_TRACE_SIZE_IN_LONGS - 1) { + if (max_entries <= 0) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -502,6 +500,7 @@ static struct lock_trace *save_trace(void) return NULL; } + trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3); hash = jhash(trace->entries, trace->nr_entries * sizeof(trace->entries[0]), 0); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 44e68761f432..0d9b6be9ecc8 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1226,8 +1226,8 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if ((wstate == WRITER_HANDOFF) && - (rwsem_spin_on_owner(sem, 0) == OWNER_NULL)) + if (wstate == WRITER_HANDOFF && + rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index c815f58e6bc0..ed9882108cd2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -18,6 +18,7 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> +#include <linux/time_namespace.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_CGROUPS .cgroup_ns = &init_cgroup_ns, #endif +#ifdef CONFIG_TIME_NS + .time_ns = &init_time_ns, + .time_ns_for_children = &init_time_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_net; } + new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns, + tsk->nsproxy->time_ns_for_children); + if (IS_ERR(new_nsp->time_ns_for_children)) { + err = PTR_ERR(new_nsp->time_ns_for_children); + goto out_time; + } + new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns); + return new_nsp; +out_time: + put_net(new_nsp->net_ns); out_net: put_cgroup_ns(new_nsp->cgroup_ns); out_cgroup: @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; + int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | - CLONE_NEWCGROUP)))) { - get_nsproxy(old_ns); - return 0; - } - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + CLONE_NEWCGROUP | CLONE_NEWTIME)))) { + if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + get_nsproxy(old_ns); + return 0; + } + } else if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; /* @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); + ret = timens_on_fork(new_ns, tsk); + if (ret) { + free_nsproxy(new_ns); + return ret; + } + tsk->nsproxy = new_ns; return 0; } @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + if (ns->time_ns) + put_time_ns(ns->time_ns); + if (ns->time_ns_for_children) + put_time_ns(ns->time_ns_for_children); put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP | + CLONE_NEWTIME))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d3667b4075c1..7cbfbeacd68a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,7 +27,10 @@ config SUSPEND_SKIP_SYNC Skip the kernel sys_sync() before freezing user processes. Some systems prefer not to pay this cost on every invocation of suspend, or they are content with invoking sync() from - user-space before invoking suspend. Say Y if that's your case. + user-space before invoking suspend. There's a run-time switch + at '/sys/power/sync_on_suspend' to configure this behaviour. + This setting changes the default for the run-tim switch. Say Y + to change the default to disable the kernel sys_sync(). config HIBERNATE_CALLBACKS bool diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 3c0a5a8170b0..6dbeedb7354c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -9,7 +9,7 @@ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> */ -#define pr_fmt(fmt) "PM: " fmt +#define pr_fmt(fmt) "PM: hibernation: " fmt #include <linux/export.h> #include <linux/suspend.h> @@ -106,7 +106,7 @@ EXPORT_SYMBOL(system_entering_hibernation); #ifdef CONFIG_PM_DEBUG static void hibernation_debug_sleep(void) { - pr_info("hibernation debug: Waiting for 5 seconds.\n"); + pr_info("debug: Waiting for 5 seconds.\n"); mdelay(5000); } @@ -277,7 +277,7 @@ static int create_image(int platform_mode) error = dpm_suspend_end(PMSG_FREEZE); if (error) { - pr_err("Some devices failed to power down, aborting hibernation\n"); + pr_err("Some devices failed to power down, aborting\n"); return error; } @@ -295,7 +295,7 @@ static int create_image(int platform_mode) error = syscore_suspend(); if (error) { - pr_err("Some system devices failed to power down, aborting hibernation\n"); + pr_err("Some system devices failed to power down, aborting\n"); goto Enable_irqs; } @@ -310,7 +310,7 @@ static int create_image(int platform_mode) restore_processor_state(); trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false); if (error) - pr_err("Error %d creating hibernation image\n", error); + pr_err("Error %d creating image\n", error); if (!in_suspend) { events_check_enabled = false; @@ -680,7 +680,7 @@ static int load_image_and_restore(void) if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - pr_err("Failed to load hibernation image, recovering.\n"); + pr_err("Failed to load image, recovering.\n"); swsusp_free(); free_basic_memory_bitmaps(); Unlock: @@ -743,7 +743,7 @@ int hibernate(void) else flags |= SF_CRC32_MODE; - pm_pr_dbg("Writing image.\n"); + pm_pr_dbg("Writing hibernation image.\n"); error = swsusp_write(flags); swsusp_free(); if (!error) { @@ -755,7 +755,7 @@ int hibernate(void) in_suspend = 0; pm_restore_gfp_mask(); } else { - pm_pr_dbg("Image restored successfully.\n"); + pm_pr_dbg("Hibernation image restored successfully.\n"); } Free_bitmaps: @@ -894,7 +894,7 @@ static int software_resume(void) goto Close_Finish; } - pm_pr_dbg("Preparing processes for restore.\n"); + pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); if (error) goto Close_Finish; @@ -903,7 +903,7 @@ static int software_resume(void) Finish: __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL); pm_restore_console(); - pr_info("resume from hibernation failed (%d)\n", error); + pr_info("resume failed (%d)\n", error); atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ Unlock: @@ -1068,7 +1068,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, lock_system_sleep(); swsusp_resume_device = res; unlock_system_sleep(); - pm_pr_dbg("Configured resume from disk to %u\n", swsusp_resume_device); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", + swsusp_resume_device); noresume = 0; software_resume(); return n; diff --git a/kernel/power/main.c b/kernel/power/main.c index e26de7af520b..69b7a8aeca3b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -190,6 +190,38 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr } power_attr(mem_sleep); + +/* + * sync_on_suspend: invoke ksys_sync_helper() before suspend. + * + * show() returns whether ksys_sync_helper() is invoked before suspend. + * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. + */ +bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); + +static ssize_t sync_on_suspend_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", sync_on_suspend_enabled); +} + +static ssize_t sync_on_suspend_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (kstrtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + sync_on_suspend_enabled = !!val; + return n; +} + +power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG @@ -855,6 +887,7 @@ static struct attribute * g[] = { &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, + &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 26b9168321e7..ddade80ad276 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -8,7 +8,7 @@ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> */ -#define pr_fmt(fmt) "PM: " fmt +#define pr_fmt(fmt) "PM: hibernation: " fmt #include <linux/version.h> #include <linux/module.h> @@ -1147,24 +1147,24 @@ void free_basic_memory_bitmaps(void) void clear_free_pages(void) { -#ifdef CONFIG_PAGE_POISONING_ZERO struct memory_bitmap *bm = free_pages_map; unsigned long pfn; if (WARN_ON(!(free_pages_map))) return; - memory_bm_position_reset(bm); - pfn = memory_bm_next_pfn(bm); - while (pfn != BM_END_OF_MAP) { - if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); - + if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (pfn_valid(pfn)) + clear_highpage(pfn_to_page(pfn)); + + pfn = memory_bm_next_pfn(bm); + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); } - memory_bm_position_reset(bm); - pr_info("free pages cleared after restore\n"); -#endif /* PAGE_POISONING_ZERO */ } /** @@ -1566,9 +1566,7 @@ static unsigned long preallocate_image_highmem(unsigned long nr_pages) */ static unsigned long __fraction(u64 x, u64 multiplier, u64 base) { - x *= multiplier; - do_div(x, base); - return (unsigned long)x; + return div64_u64(x * multiplier, base); } static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, @@ -1705,16 +1703,20 @@ int hibernate_preallocate_memory(void) ktime_t start, stop; int error; - pr_info("Preallocating image memory... "); + pr_info("Preallocating image memory\n"); start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate original bitmap\n"); goto err_out; + } error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); - if (error) + if (error) { + pr_err("Cannot allocate copy bitmap\n"); goto err_out; + } alloc_normal = 0; alloc_highmem = 0; @@ -1804,8 +1806,11 @@ int hibernate_preallocate_memory(void) alloc -= pages; pages += pages_highmem; pages_highmem = preallocate_image_highmem(alloc); - if (pages_highmem < alloc) + if (pages_highmem < alloc) { + pr_err("Image allocation is %lu pages short\n", + alloc - pages_highmem); goto err_out; + } pages += pages_highmem; /* * size is the desired number of saveable pages to leave in @@ -1836,13 +1841,12 @@ int hibernate_preallocate_memory(void) out: stop = ktime_get(); - pr_cont("done (allocated %lu pages)\n", pages); + pr_info("Allocated %lu pages for snapshot\n", pages); swsusp_show_speed(start, stop, pages, "Allocated"); return 0; err_out: - pr_cont("\n"); swsusp_free(); return -ENOMEM; } @@ -1976,7 +1980,7 @@ asmlinkage __visible int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - pr_info("Creating hibernation image:\n"); + pr_info("Creating image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); @@ -2010,7 +2014,7 @@ asmlinkage __visible int swsusp_save(void) nr_copy_pages = nr_pages; nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); - pr_info("Hibernation image created (%d pages copied)\n", nr_pages); + pr_info("Image created (%d pages copied)\n", nr_pages); return 0; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f3b7239f1892..2c47280fbfc7 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -564,7 +564,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); - if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + if (sync_on_suspend_enabled) { trace_suspend_resume(TPS("sync_filesystems"), 0, true); ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 60564b58de07..e1ed58adb69e 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -70,7 +70,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) static char info_test[] __initdata = KERN_INFO "PM: test RTC wakeup from '%s' suspend\n"; - unsigned long now; + time64_t now; struct rtc_wkalrm alm; int status; @@ -81,10 +81,10 @@ repeat: printk(err_readtime, dev_name(&rtc->dev), status); return; } - rtc_tm_to_time(&alm.time, &now); + now = rtc_tm_to_time64(&alm.time); memset(&alm, 0, sizeof alm); - rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); + rtc_time64_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time); alm.enabled = true; status = rtc_set_alarm(rtc, &alm); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index cb9ddcc08119..43d6179508d6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,12 +264,17 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) +static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, + unsigned int mode) { + int ret; + if (mode & PTRACE_MODE_NOAUDIT) - return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); else - return has_ns_capability(current, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); + + return ret == 0; } /* Returns 0 on success, -errno on denial. */ @@ -321,7 +326,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(tcred->user_ns, mode)) + if (ptrace_has_cap(cred, tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -340,7 +345,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(mm->user_ns, mode))) + !ptrace_has_cap(cred, mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); diff --git a/kernel/rseq.c b/kernel/rseq.c index 27c48eb7de40..a4f86a9d6937 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -310,6 +310,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int ret; if (flags & RSEQ_FLAG_UNREGISTER) { + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 9fcb2a695a41..008d6ac2342b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -163,6 +163,12 @@ static int __init housekeeping_isolcpus_setup(char *str) continue; } + if (!strncmp(str, "managed_irq,", 12)) { + str += 12; + flags |= HK_FLAG_MANAGED_IRQ; + continue; + } + pr_warn("isolcpus: Error, unknown flag\n"); return 0; } diff --git a/kernel/smp.c b/kernel/smp.c index 7dbcb402c2fc..3b7bedc97af3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -395,22 +395,9 @@ call: } EXPORT_SYMBOL_GPL(smp_call_function_any); -/** - * smp_call_function_many(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on (only runs on online subset). - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. Preemption - * must be disabled when calling this function. - */ -void smp_call_function_many(const struct cpumask *mask, - smp_call_func_t func, void *info, bool wait) +static void smp_call_function_many_cond(const struct cpumask *mask, + smp_call_func_t func, void *info, + bool wait, smp_cond_func_t cond_func) { struct call_function_data *cfd; int cpu, next_cpu, this_cpu = smp_processor_id(); @@ -448,7 +435,8 @@ void smp_call_function_many(const struct cpumask *mask, /* Fastpath: do that cpu by itself. */ if (next_cpu >= nr_cpu_ids) { - smp_call_function_single(cpu, func, info, wait); + if (!cond_func || (cond_func && cond_func(cpu, info))) + smp_call_function_single(cpu, func, info, wait); return; } @@ -465,6 +453,9 @@ void smp_call_function_many(const struct cpumask *mask, for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); + if (cond_func && !cond_func(cpu, info)) + continue; + csd_lock(csd); if (wait) csd->flags |= CSD_FLAG_SYNCHRONOUS; @@ -486,6 +477,26 @@ void smp_call_function_many(const struct cpumask *mask, } } } + +/** + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +void smp_call_function_many(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + smp_call_function_many_cond(mask, func, info, wait, NULL); +} EXPORT_SYMBOL(smp_call_function_many); /** @@ -668,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask); * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. - * @gfp_flags: GFP flags to use when allocating the cpumask - * used internally by the function. - * - * The function might sleep if the GFP flags indicates a non - * atomic allocation is allowed. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. @@ -680,46 +686,27 @@ EXPORT_SYMBOL(on_each_cpu_mask); * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ -void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags, const struct cpumask *mask) +void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, const struct cpumask *mask) { - cpumask_var_t cpus; - int cpu, ret; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { - preempt_disable(); - for_each_cpu(cpu, mask) - if (cond_func(cpu, info)) - __cpumask_set_cpu(cpu, cpus); - on_each_cpu_mask(cpus, func, info, wait); - preempt_enable(); - free_cpumask_var(cpus); - } else { - /* - * No free cpumask, bother. No matter, we'll - * just have to IPI them one by one. - */ - preempt_disable(); - for_each_cpu(cpu, mask) - if (cond_func(cpu, info)) { - ret = smp_call_function_single(cpu, func, - info, wait); - WARN_ON_ONCE(ret); - } - preempt_enable(); + int cpu = get_cpu(); + + smp_call_function_many_cond(mask, func, info, wait, cond_func); + if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) { + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); } + put_cpu(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait) { - on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, - cpu_online_mask); + on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 1867044800bb..c8f00168afe8 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o +obj-$(CONFIG_TIME_NS) += namespace.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 451f9d05ccfe..2ffb466af77e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -26,6 +26,7 @@ #include <linux/freezer.h> #include <linux/compat.h> #include <linux/module.h> +#include <linux/time_namespace.h> #include "posix-timers.h" @@ -36,13 +37,15 @@ * struct alarm_base - Alarm timer bases * @lock: Lock for syncrhonized access to the base * @timerqueue: Timerqueue head managing the list of events - * @gettime: Function to read the time correlating to the base + * @get_ktime: Function to read the time correlating to the base + * @get_timespec: Function to read the namespace time correlating to the base * @base_clockid: clockid for the base */ static struct alarm_base { spinlock_t lock; struct timerqueue_head timerqueue; - ktime_t (*gettime)(void); + ktime_t (*get_ktime)(void); + void (*get_timespec)(struct timespec64 *tp); clockid_t base_clockid; } alarm_bases[ALARM_NUMTYPE]; @@ -55,8 +58,6 @@ static DEFINE_SPINLOCK(freezer_delta_lock); #endif #ifdef CONFIG_RTC_CLASS -static struct wakeup_source *ws; - /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; @@ -66,8 +67,6 @@ static DEFINE_SPINLOCK(rtcdev_lock); * alarmtimer_get_rtcdev - Return selected rtcdevice * * This function returns the rtc device to use for wakealarms. - * If one has not already been chosen, it checks to see if a - * functional rtc device is available. */ struct rtc_device *alarmtimer_get_rtcdev(void) { @@ -87,7 +86,8 @@ static int alarmtimer_rtc_add_device(struct device *dev, { unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); - struct wakeup_source *__ws; + struct platform_device *pdev; + int ret = 0; if (rtcdev) return -EBUSY; @@ -97,26 +97,31 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (!device_may_wakeup(rtc->dev.parent)) return -1; - __ws = wakeup_source_register(dev, "alarmtimer"); + pdev = platform_device_register_data(dev, "alarmtimer", + PLATFORM_DEVID_AUTO, NULL, 0); + if (!IS_ERR(pdev)) + device_init_wakeup(&pdev->dev, true); spin_lock_irqsave(&rtcdev_lock, flags); - if (!rtcdev) { + if (!IS_ERR(pdev) && !rtcdev) { if (!try_module_get(rtc->owner)) { - spin_unlock_irqrestore(&rtcdev_lock, flags); - return -1; + ret = -1; + goto unlock; } rtcdev = rtc; /* hold a reference so it doesn't go away */ get_device(dev); - ws = __ws; - __ws = NULL; + pdev = NULL; + } else { + ret = -1; } +unlock: spin_unlock_irqrestore(&rtcdev_lock, flags); - wakeup_source_unregister(__ws); + platform_device_unregister(pdev); - return 0; + return ret; } static inline void alarmtimer_rtc_timer_init(void) @@ -138,11 +143,6 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} -#define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } static inline void alarmtimer_rtc_timer_init(void) { } @@ -207,7 +207,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) spin_unlock_irqrestore(&base->lock, flags); if (alarm->function) - restart = alarm->function(alarm, base->gettime()); + restart = alarm->function(alarm, base->get_ktime()); spin_lock_irqsave(&base->lock, flags); if (restart != ALARMTIMER_NORESTART) { @@ -217,7 +217,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) } spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_fired(alarm, base->gettime()); + trace_alarmtimer_fired(alarm, base->get_ktime()); return ret; } @@ -225,7 +225,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) ktime_t alarm_expires_remaining(const struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - return ktime_sub(alarm->node.expires, base->gettime()); + return ktime_sub(alarm->node.expires, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_expires_remaining); @@ -270,7 +270,7 @@ static int alarmtimer_suspend(struct device *dev) spin_unlock_irqrestore(&base->lock, flags); if (!next) continue; - delta = ktime_sub(next->expires, base->gettime()); + delta = ktime_sub(next->expires, base->get_ktime()); if (!min || (delta < min)) { expires = next->expires; min = delta; @@ -281,7 +281,7 @@ static int alarmtimer_suspend(struct device *dev) return 0; if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { - __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + pm_wakeup_event(dev, 2 * MSEC_PER_SEC); return -EBUSY; } @@ -296,7 +296,7 @@ static int alarmtimer_suspend(struct device *dev) /* Set alarm, if in the past reject suspend briefly to handle */ ret = rtc_timer_start(rtc, &rtctimer, now, 0); if (ret < 0) - __pm_wakeup_event(ws, MSEC_PER_SEC); + pm_wakeup_event(dev, MSEC_PER_SEC); return ret; } @@ -364,7 +364,7 @@ void alarm_start(struct alarm *alarm, ktime_t start) hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_start(alarm, base->gettime()); + trace_alarmtimer_start(alarm, base->get_ktime()); } EXPORT_SYMBOL_GPL(alarm_start); @@ -377,7 +377,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add_safe(start, base->gettime()); + start = ktime_add_safe(start, base->get_ktime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -414,7 +414,7 @@ int alarm_try_to_cancel(struct alarm *alarm) alarmtimer_dequeue(base, alarm); spin_unlock_irqrestore(&base->lock, flags); - trace_alarmtimer_cancel(alarm, base->gettime()); + trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; } EXPORT_SYMBOL_GPL(alarm_try_to_cancel); @@ -474,7 +474,7 @@ u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) { struct alarm_base *base = &alarm_bases[alarm->type]; - return alarm_forward(alarm, base->gettime(), interval); + return alarm_forward(alarm, base->get_ktime(), interval); } EXPORT_SYMBOL_GPL(alarm_forward_now); @@ -500,7 +500,7 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) return; } - delta = ktime_sub(absexp, base->gettime()); + delta = ktime_sub(absexp, base->get_ktime()); spin_lock_irqsave(&freezer_delta_lock, flags); if (!freezer_delta || (delta < freezer_delta)) { @@ -632,7 +632,7 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, struct alarm_base *base = &alarm_bases[alarm->type]; if (!absolute) - expires = ktime_add_safe(expires, base->gettime()); + expires = ktime_add_safe(expires, base->get_ktime()); if (sigev_none) alarm->node.expires = expires; else @@ -657,24 +657,41 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp } /** - * alarm_clock_get - posix clock_get interface + * alarm_clock_get_timespec - posix clock_get_timespec interface * @which_clock: clockid * @tp: timespec to fill. * - * Provides the underlying alarm base time. + * Provides the underlying alarm base time in a tasks time namespace. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) +static int alarm_clock_get_timespec(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec64(base->gettime()); + base->get_timespec(tp); + return 0; } /** + * alarm_clock_get_ktime - posix clock_get_ktime interface + * @which_clock: clockid + * + * Provides the underlying alarm base time in the root namespace. + */ +static ktime_t alarm_clock_get_ktime(clockid_t which_clock) +{ + struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; + + if (!alarmtimer_get_rtcdev()) + return -EINVAL; + + return base->get_ktime(); +} + +/** * alarm_timer_create - posix timer_create interface * @new_timer: k_itimer pointer to manage * @@ -747,7 +764,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, struct timespec64 rmt; ktime_t rem; - rem = ktime_sub(absexp, alarm_bases[type].gettime()); + rem = ktime_sub(absexp, alarm_bases[type].get_ktime()); if (rem <= 0) return 0; @@ -816,9 +833,11 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { - ktime_t now = alarm_bases[type].gettime(); + ktime_t now = alarm_bases[type].get_ktime(); exp = ktime_add_safe(now, exp); + } else { + exp = timens_ktime_to_host(which_clock, exp); } ret = alarmtimer_do_nsleep(&alarm, exp, type); @@ -837,7 +856,8 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, const struct k_clock alarm_clock = { .clock_getres = alarm_clock_getres, - .clock_get = alarm_clock_get, + .clock_get_ktime = alarm_clock_get_ktime, + .clock_get_timespec = alarm_clock_get_timespec, .timer_create = alarm_timer_create, .timer_set = common_timer_set, .timer_del = common_timer_del, @@ -866,6 +886,12 @@ static struct platform_driver alarmtimer_driver = { } }; +static void get_boottime_timespec(struct timespec64 *tp) +{ + ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); +} + /** * alarmtimer_init - Initialize alarm timer code * @@ -874,17 +900,18 @@ static struct platform_driver alarmtimer_driver = { */ static int __init alarmtimer_init(void) { - struct platform_device *pdev; - int error = 0; + int error; int i; alarmtimer_rtc_timer_init(); /* Initialize alarm bases */ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; - alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real; + alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64, alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; - alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime; + alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec; for (i = 0; i < ALARM_NUMTYPE; i++) { timerqueue_init_head(&alarm_bases[i].timerqueue); spin_lock_init(&alarm_bases[i].lock); @@ -898,15 +925,7 @@ static int __init alarmtimer_init(void) if (error) goto out_if; - pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); - if (IS_ERR(pdev)) { - error = PTR_ERR(pdev); - goto out_drv; - } return 0; - -out_drv: - platform_driver_unregister(&alarmtimer_driver); out_if: alarmtimer_rtc_interface_remove(); return error; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8de90ea31280..3a609e7344f3 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1477,7 +1477,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, - unsigned long flags) + unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1910,8 +1910,8 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(const struct timespec64 *rqtp, - const enum hrtimer_mode mode, const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; @@ -1923,7 +1923,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; @@ -1958,7 +1958,8 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif @@ -1978,7 +1979,8 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, + CLOCK_MONOTONIC); } #endif diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c new file mode 100644 index 000000000000..12858507d75a --- /dev/null +++ b/kernel/time/namespace.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/time_namespace.h> +#include <linux/user_namespace.h> +#include <linux/sched/signal.h> +#include <linux/sched/task.h> +#include <linux/seq_file.h> +#include <linux/proc_ns.h> +#include <linux/export.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <linux/mm.h> + +#include <vdso/datapage.h> + +ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, + struct timens_offsets *ns_offsets) +{ + ktime_t offset; + + switch (clockid) { + case CLOCK_MONOTONIC: + offset = timespec64_to_ktime(ns_offsets->monotonic); + break; + case CLOCK_BOOTTIME: + case CLOCK_BOOTTIME_ALARM: + offset = timespec64_to_ktime(ns_offsets->boottime); + break; + default: + return tim; + } + + /* + * Check that @tim value is in [offset, KTIME_MAX + offset] + * and subtract offset. + */ + if (tim < offset) { + /* + * User can specify @tim *absolute* value - if it's lesser than + * the time namespace's offset - it's already expired. + */ + tim = 0; + } else { + tim = ktime_sub(tim, offset); + if (unlikely(tim > KTIME_MAX)) + tim = KTIME_MAX; + } + + return tim; +} + +static struct ucounts *inc_time_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES); +} + +static void dec_time_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES); +} + +/** + * clone_time_ns - Clone a time namespace + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * Clone @old_ns and set the clone refcount to 1 + * + * Return: The new namespace or ERR_PTR. + */ +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, + struct time_namespace *old_ns) +{ + struct time_namespace *ns; + struct ucounts *ucounts; + int err; + + err = -ENOSPC; + ucounts = inc_time_namespaces(user_ns); + if (!ucounts) + goto fail; + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + goto fail_dec; + + kref_init(&ns->kref); + + ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!ns->vvar_page) + goto fail_free; + + err = ns_alloc_inum(&ns->ns); + if (err) + goto fail_free_page; + + ns->ucounts = ucounts; + ns->ns.ops = &timens_operations; + ns->user_ns = get_user_ns(user_ns); + ns->offsets = old_ns->offsets; + ns->frozen_offsets = false; + return ns; + +fail_free_page: + __free_page(ns->vvar_page); +fail_free: + kfree(ns); +fail_dec: + dec_time_namespaces(ucounts); +fail: + return ERR_PTR(err); +} + +/** + * copy_time_ns - Create timens_for_children from @old_ns + * @flags: Cloning flags + * @user_ns: User namespace which owns a new namespace. + * @old_ns: Namespace to clone + * + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; + * adds a refcounter to @old_ns otherwise. + * + * Return: timens_for_children namespace or ERR_PTR. + */ +struct time_namespace *copy_time_ns(unsigned long flags, + struct user_namespace *user_ns, struct time_namespace *old_ns) +{ + if (!(flags & CLONE_NEWTIME)) + return get_time_ns(old_ns); + + return clone_time_ns(user_ns, old_ns); +} + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_data->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_data->clock_mode set to VCLOCK_TIMENS which enforces + * the time namespace handling path. + */ +static void timens_setup_vdso_data(struct vdso_data *vdata, + struct time_namespace *ns) +{ + struct timens_offset *offset = vdata->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vdata->seq = 1; + vdata->clock_mode = VCLOCK_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +static DEFINE_MUTEX(offset_lock); + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_data *vdata; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + mutex_lock(&offset_lock); + /* Nothing to-do: vvar_page has been already initialized. */ + if (ns->frozen_offsets) + goto out; + + ns->frozen_offsets = true; + vdata = arch_get_vdso_data(page_address(ns->vvar_page)); + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_data(&vdata[i], ns); + +out: + mutex_unlock(&offset_lock); +} + +void free_time_ns(struct kref *kref) +{ + struct time_namespace *ns; + + ns = container_of(kref, struct time_namespace, kref); + dec_time_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + __free_page(ns->vvar_page); + kfree(ns); +} + +static struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return container_of(ns, struct time_namespace, ns); +} + +static struct ns_common *timens_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static struct ns_common *timens_for_children_get(struct task_struct *task) +{ + struct time_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void timens_put(struct ns_common *ns) +{ + put_time_ns(to_time_ns(ns)); +} + +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +{ + struct time_namespace *ns = to_time_ns(new); + int err; + + if (!current_is_single_threaded()) + return -EUSERS; + + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EPERM; + + timens_set_vvar_page(current, ns); + + err = vdso_join_timens(current, ns); + if (err) + return err; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns_for_children); + nsproxy->time_ns_for_children = ns; + return 0; +} + +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +{ + struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; + struct time_namespace *ns = to_time_ns(nsc); + int err; + + /* create_new_namespaces() already incremented the ref counter */ + if (nsproxy->time_ns == nsproxy->time_ns_for_children) + return 0; + + timens_set_vvar_page(tsk, ns); + + err = vdso_join_timens(tsk, ns); + if (err) + return err; + + get_time_ns(ns); + put_time_ns(nsproxy->time_ns); + nsproxy->time_ns = ns; + + return 0; +} + +static struct user_namespace *timens_owner(struct ns_common *ns) +{ + return to_time_ns(ns)->user_ns; +} + +static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) +{ + seq_printf(m, "%d %lld %ld\n", clockid, ts->tv_sec, ts->tv_nsec); +} + +void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + + ns = timens_for_children_get(p); + if (!ns) + return; + time_ns = to_time_ns(ns); + + show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); + show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); + put_time_ns(time_ns); +} + +int proc_timens_set_offset(struct file *file, struct task_struct *p, + struct proc_timens_offset *offsets, int noffsets) +{ + struct ns_common *ns; + struct time_namespace *time_ns; + struct timespec64 tp; + int i, err; + + ns = timens_for_children_get(p); + if (!ns) + return -ESRCH; + time_ns = to_time_ns(ns); + + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { + put_time_ns(time_ns); + return -EPERM; + } + + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + ktime_get_ts64(&tp); + break; + case CLOCK_BOOTTIME: + ktime_get_boottime_ts64(&tp); + break; + default: + err = -EINVAL; + goto out; + } + + err = -ERANGE; + + if (off->val.tv_sec > KTIME_SEC_MAX || + off->val.tv_sec < -KTIME_SEC_MAX) + goto out; + + tp = timespec64_add(tp, off->val); + /* + * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is + * still unreachable. + */ + if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) + goto out; + } + + mutex_lock(&offset_lock); + if (time_ns->frozen_offsets) { + err = -EACCES; + goto out_unlock; + } + + err = 0; + /* Don't report errors after this line */ + for (i = 0; i < noffsets; i++) { + struct proc_timens_offset *off = &offsets[i]; + struct timespec64 *offset = NULL; + + switch (off->clockid) { + case CLOCK_MONOTONIC: + offset = &time_ns->offsets.monotonic; + break; + case CLOCK_BOOTTIME: + offset = &time_ns->offsets.boottime; + break; + } + + *offset = off->val; + } + +out_unlock: + mutex_unlock(&offset_lock); +out: + put_time_ns(time_ns); + + return err; +} + +const struct proc_ns_operations timens_operations = { + .name = "time", + .type = CLONE_NEWTIME, + .get = timens_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +const struct proc_ns_operations timens_for_children_operations = { + .name = "time_for_children", + .type = CLONE_NEWTIME, + .get = timens_for_children_get, + .put = timens_put, + .install = timens_install, + .owner = timens_owner, +}; + +struct time_namespace init_time_ns = { + .kref = KREF_INIT(3), + .user_ns = &init_user_ns, + .ns.inum = PROC_TIME_INIT_INO, + .ns.ops = &timens_operations, + .frozen_offsets = true, +}; + +static int __init time_ns_init(void) +{ + return 0; +} +subsys_initcall(time_ns_init); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 200fb2d3be99..77c0c2370b6d 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -310,8 +310,8 @@ out: } const struct k_clock clock_posix_dynamic = { - .clock_getres = pc_clock_getres, - .clock_set = pc_clock_settime, - .clock_get = pc_clock_gettime, - .clock_adj = pc_clock_adjtime, + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get_timespec = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, }; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 42d512fcfda2..8ff6da77a01f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1391,26 +1391,26 @@ static int thread_cpu_timer_create(struct k_itimer *timer) } const struct k_clock clock_posix_cpu = { - .clock_getres = posix_cpu_clock_getres, - .clock_set = posix_cpu_clock_set, - .clock_get = posix_cpu_clock_get, - .timer_create = posix_cpu_timer_create, - .nsleep = posix_cpu_nsleep, - .timer_set = posix_cpu_timer_set, - .timer_del = posix_cpu_timer_del, - .timer_get = posix_cpu_timer_get, - .timer_rearm = posix_cpu_timer_rearm, + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get_timespec = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, + .timer_rearm = posix_cpu_timer_rearm, }; const struct k_clock clock_process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, + .clock_getres = process_cpu_clock_getres, + .clock_get_timespec = process_cpu_clock_get, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, }; const struct k_clock clock_thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .timer_create = thread_cpu_timer_create, + .clock_getres = thread_cpu_clock_getres, + .clock_get_timespec = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, }; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 67df65f887ac..fcb3b21d8bdc 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -14,6 +14,7 @@ #include <linux/ktime.h> #include <linux/timekeeping.h> #include <linux/posix-timers.h> +#include <linux/time_namespace.h> #include <linux/compat.h> #ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER @@ -77,9 +78,11 @@ int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp) break; case CLOCK_MONOTONIC: ktime_get_ts64(tp); + timens_add_monotonic(tp); break; case CLOCK_BOOTTIME: ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); break; default: return -EINVAL; @@ -126,6 +129,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct __kernel_timespec __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -144,13 +148,19 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); +#endif + +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif @@ -212,6 +222,7 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rmtp) { struct timespec64 t; + ktime_t texp; switch (which_clock) { case CLOCK_REALTIME: @@ -230,7 +241,10 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, rmtp = NULL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ? + texp = timespec64_to_ktime(t); + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0ec5b7a1d769..ff0eb30de346 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -30,6 +30,7 @@ #include <linux/hashtable.h> #include <linux/compat.h> #include <linux/nospec.h> +#include <linux/time_namespace.h> #include "timekeeping.h" #include "posix-timers.h" @@ -165,12 +166,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; } +static ktime_t posix_get_realtime_ktime(clockid_t which_clock) +{ + return ktime_get_real(); +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) @@ -187,18 +193,25 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) +static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); + timens_add_monotonic(tp); return 0; } +static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) +{ + return ktime_get(); +} + /* * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -213,6 +226,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); + timens_add_monotonic(tp); return 0; } @@ -222,18 +236,29 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 * return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) +static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); + timens_add_boottime(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) +static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) +{ + return ktime_get_boottime(); +} + +static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } +static ktime_t posix_get_tai_ktime(clockid_t which_clock) +{ + return ktime_get_clocktai(); +} + static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; @@ -645,7 +670,6 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; - struct timespec64 ts64; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; @@ -663,12 +687,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) return; } - /* - * The timespec64 based conversion is suboptimal, but it's not - * worth to implement yet another callback. - */ - kc->clock_get(timr->it_clock, &ts64); - now = timespec64_to_ktime(ts64); + now = kc->clock_get_ktime(timr->it_clock); /* * When a requeue is pending or this is a SIGEV_NONE timer move the @@ -781,7 +800,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic - * functions which use timr->kclock->clock_get() work. + * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. @@ -866,6 +885,8 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); + if (flags & TIMER_ABSTIME) + expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); @@ -1067,7 +1088,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; @@ -1149,7 +1170,7 @@ SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, if (!kc) return -EINVAL; - err = kc->clock_get(which_clock, &ts); + err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; @@ -1200,7 +1221,22 @@ SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { - return hrtimer_nanosleep(rqtp, flags & TIMER_ABSTIME ? + ktime_t texp = timespec64_to_ktime(*rqtp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +static int common_nsleep_timens(const clockid_t which_clock, int flags, + const struct timespec64 *rqtp) +{ + ktime_t texp = timespec64_to_ktime(*rqtp); + + if (flags & TIMER_ABSTIME) + texp = timens_ktime_to_host(which_clock, texp); + + return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } @@ -1261,7 +1297,8 @@ SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_clock_realtime_get, + .clock_get_timespec = posix_get_realtime_timespec, + .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, @@ -1279,8 +1316,9 @@ static const struct k_clock clock_realtime = { static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_ktime_get_ts, - .nsleep = common_nsleep, + .clock_get_timespec = posix_get_monotonic_timespec, + .clock_get_ktime = posix_get_monotonic_ktime, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, @@ -1295,22 +1333,23 @@ static const struct k_clock clock_monotonic = { static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_monotonic_raw, + .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, + .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, + .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_tai, + .clock_get_ktime = posix_get_tai_ktime, + .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, @@ -1326,8 +1365,9 @@ static const struct k_clock clock_tai = { static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, - .clock_get = posix_get_boottime, - .nsleep = common_nsleep, + .clock_get_ktime = posix_get_boottime_ktime, + .clock_get_timespec = posix_get_boottime_timespec, + .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 897c29e162b9..f32a2ebba9b8 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -6,8 +6,11 @@ struct k_clock { struct timespec64 *tp); int (*clock_set)(const clockid_t which_clock, const struct timespec64 *tp); - int (*clock_get)(const clockid_t which_clock, - struct timespec64 *tp); + /* Returns the clock value in the current time namespace. */ + int (*clock_get_timespec)(const clockid_t which_clock, + struct timespec64 *tp); + /* Returns the clock value in the root time namespace. */ + ktime_t (*clock_get_ktime)(const clockid_t which_clock); int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index dbd69052eaa6..e4332e3e2d56 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -169,14 +169,15 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - unsigned long r; + unsigned long r, flags; char r_unit; struct clock_read_data rd; if (cd.rate > rate) return; - WARN_ON(!irqs_disabled()); + /* Cannot register a sched_clock with interrupts on */ + local_irq_save(flags); /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); @@ -233,6 +234,8 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); + local_irq_restore(flags); + pr_debug("Registered %pS as sched_clock source\n", read); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 59225b484e4e..7e5d3524e924 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -11,6 +11,7 @@ #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> +#include <linux/nmi.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> @@ -558,6 +559,7 @@ void tick_unfreeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { + touch_softlockup_watchdog(); tick_resume_local(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8b192e67aabc..a792d21cac64 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -58,8 +58,9 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Do a quick check without holding jiffies_lock: + * The READ_ONCE() pairs with two updates done later in this function. */ - delta = ktime_sub(now, last_jiffies_update); + delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); if (delta < tick_period) return; @@ -70,8 +71,9 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add(last_jiffies_update, tick_period)); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -79,8 +81,10 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); + /* Pairs with the lockless read in this function. */ + WRITE_ONCE(last_jiffies_update, + ktime_add_ns(last_jiffies_update, + incr * ticks)); } do_timer(++ticks); diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 5ee0f7709410..9577c89179cd 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -28,11 +28,6 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; - /* CLOCK_REALTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; - vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; @@ -70,12 +65,6 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; - - /* - * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. - */ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); } void update_vsyscall(struct timekeeper *tk) @@ -84,20 +73,17 @@ void update_vsyscall(struct timekeeper *tk) struct vdso_timestamp *vdso_ts; u64 nsec; - if (__arch_update_vdso_data()) { - /* - * Some architectures might want to skip the update of the - * data page. - */ - return; - } - /* copy vsyscall data */ vdso_write_begin(vdata); vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + /* CLOCK_REALTIME also required for time() */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; @@ -110,7 +96,18 @@ void update_vsyscall(struct timekeeper *tk) nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); - update_vdso_data(vdata, tk); + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + + /* + * Architectures can opt out of updating the high resolution part + * of the VDSO. + */ + if (__arch_update_vdso_data()) + update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ddb7e7f5fe8d..5b6ee4aadc26 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9420,6 +9420,11 @@ __init static int tracing_set_default_clock(void) { /* sched_clock_stable() is determined in late_initcall */ if (!trace_boot_clock && !sched_clock_stable()) { + if (security_locked_down(LOCKDOWN_TRACEFS)) { + pr_warn("Can not set tracing clock due to lockdown\n"); + return -EPERM; + } + printk(KERN_WARNING "Unstable clock detected, switching default tracing clock to \"global\"\n" "If you want to keep using the local clock, then add:\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index f62de5f43e79..6ac35b9e195d 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -116,6 +116,7 @@ struct hist_field { struct ftrace_event_field *field; unsigned long flags; hist_field_fn_t fn; + unsigned int ref; unsigned int size; unsigned int offset; unsigned int is_signed; @@ -1766,11 +1767,13 @@ static struct hist_field *find_var(struct hist_trigger_data *hist_data, struct event_trigger_data *test; struct hist_field *hist_field; + lockdep_assert_held(&event_mutex); + hist_field = find_var_field(hist_data, var_name); if (hist_field) return hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -1820,7 +1823,9 @@ static struct hist_field *find_file_var(struct trace_event_file *file, struct event_trigger_data *test; struct hist_field *hist_field; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { test_data = test->private_data; hist_field = find_var_field(test_data, var_name); @@ -2423,8 +2428,16 @@ static int contains_operator(char *str) return field_op; } +static void get_hist_field(struct hist_field *hist_field) +{ + hist_field->ref++; +} + static void __destroy_hist_field(struct hist_field *hist_field) { + if (--hist_field->ref > 1) + return; + kfree(hist_field->var.name); kfree(hist_field->name); kfree(hist_field->type); @@ -2466,6 +2479,8 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field) return NULL; + hist_field->ref = 1; + hist_field->hist_data = hist_data; if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS) @@ -2661,6 +2676,17 @@ static struct hist_field *create_var_ref(struct hist_trigger_data *hist_data, { unsigned long flags = HIST_FIELD_FL_VAR_REF; struct hist_field *ref_field; + int i; + + /* Check if the variable already exists */ + for (i = 0; i < hist_data->n_var_refs; i++) { + ref_field = hist_data->var_refs[i]; + if (ref_field->var.idx == var_field->var.idx && + ref_field->var.hist_data == var_field->hist_data) { + get_hist_field(ref_field); + return ref_field; + } + } ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL); if (ref_field) { @@ -3115,7 +3141,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, { struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (test->private_data == hist_data) return test->filter_str; @@ -3166,9 +3194,11 @@ find_compatible_hist(struct hist_trigger_data *target_hist_data, struct event_trigger_data *test; unsigned int n_keys; + lockdep_assert_held(&event_mutex); + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; @@ -5528,7 +5558,7 @@ static int hist_show(struct seq_file *m, void *v) goto out_unlock; } - list_for_each_entry_rcu(data, &event_file->triggers, list) { + list_for_each_entry(data, &event_file->triggers, list) { if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) hist_trigger_show(m, data, n++); } @@ -5921,7 +5951,9 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->attrs->name && !named_data) goto new; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6005,10 +6037,12 @@ static bool have_hist_trigger_match(struct event_trigger_data *data, struct event_trigger_data *test, *named_data = NULL; bool match = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (hist_trigger_match(data, test, named_data, false)) { match = true; @@ -6026,10 +6060,12 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6051,10 +6087,12 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, *named_data = NULL; bool unregistered = false; + lockdep_assert_held(&event_mutex); + if (hist_data->attrs->name) named_data = find_named_trigger(hist_data->attrs->name); - list_for_each_entry_rcu(test, &file->triggers, list) { + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (!hist_trigger_match(data, test, named_data, false)) continue; @@ -6080,7 +6118,9 @@ static bool hist_file_check_refs(struct trace_event_file *file) struct hist_trigger_data *hist_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { hist_data = test->private_data; if (check_var_refs(hist_data)) @@ -6323,7 +6363,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, struct enable_trigger_data *enable_data = data->private_data; struct event_trigger_data *test; - list_for_each_entry_rcu(test, &enable_data->file->triggers, list) { + list_for_each_entry_rcu(test, &enable_data->file->triggers, list, + lockdep_is_held(&event_mutex)) { if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { if (enable_data->enable) test->paused = false; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2cd53ca21b51..40106fff06a4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -501,7 +501,9 @@ void update_cond_flag(struct trace_event_file *file) struct event_trigger_data *data; bool set_cond = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->filter || event_command_post_trigger(data->cmd_ops) || event_command_needs_rec(data->cmd_ops)) { set_cond = true; @@ -536,7 +538,9 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { if (test->cmd_ops->trigger_type == data->cmd_ops->trigger_type) { ret = -EEXIST; goto out; @@ -581,7 +585,9 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); @@ -1497,7 +1503,9 @@ int event_enable_register_trigger(char *glob, struct event_trigger_data *test; int ret = 0; - list_for_each_entry_rcu(test, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(test, &file->triggers, list) { test_enable_data = test->private_data; if (test_enable_data && (test->cmd_ops->trigger_type == @@ -1537,7 +1545,9 @@ void event_enable_unregister_trigger(char *glob, struct event_trigger_data *data; bool unregistered = false; - list_for_each_entry_rcu(data, &file->triggers, list) { + lockdep_assert_held(&event_mutex); + + list_for_each_entry(data, &file->triggers, list) { enable_data = data->private_data; if (enable_data && (data->cmd_ops->trigger_type == diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7f890262c8a3..3f54dc2f6e1c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -290,7 +290,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, INIT_HLIST_NODE(&tk->rp.kp.hlist); INIT_LIST_HEAD(&tk->rp.kp.list); - ret = trace_probe_init(&tk->tp, event, group); + ret = trace_probe_init(&tk->tp, event, group, false); if (ret < 0) goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 905b10af5d5c..9ae87be422f2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -984,15 +984,19 @@ void trace_probe_cleanup(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group) + const char *group, bool alloc_filter) { struct trace_event_call *call; + size_t size = sizeof(struct trace_probe_event); int ret = 0; if (!event || !group) return -EINVAL; - tp->event = kzalloc(sizeof(struct trace_probe_event), GFP_KERNEL); + if (alloc_filter) + size += sizeof(struct trace_uprobe_filter); + + tp->event = kzalloc(size, GFP_KERNEL); if (!tp->event) return -ENOMEM; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4ee703728aec..a0ff9e200ef6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -223,6 +223,12 @@ struct probe_arg { const struct fetch_type *type; /* Type of this argument */ }; +struct trace_uprobe_filter { + rwlock_t rwlock; + int nr_systemwide; + struct list_head perf_events; +}; + /* Event call and class holder */ struct trace_probe_event { unsigned int flags; /* For TP_FLAG_* */ @@ -230,6 +236,7 @@ struct trace_probe_event { struct trace_event_call call; struct list_head files; struct list_head probes; + struct trace_uprobe_filter filter[0]; }; struct trace_probe { @@ -322,7 +329,7 @@ static inline bool trace_probe_has_single_file(struct trace_probe *tp) } int trace_probe_init(struct trace_probe *tp, const char *event, - const char *group); + const char *group, bool alloc_filter); void trace_probe_cleanup(struct trace_probe *tp); int trace_probe_append(struct trace_probe *tp, struct trace_probe *to); void trace_probe_unlink(struct trace_probe *tp); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 352073d36585..2619bc5ed520 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -34,12 +34,6 @@ struct uprobe_trace_entry_head { #define DATAOF_TRACE_ENTRY(entry, is_return) \ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) -struct trace_uprobe_filter { - rwlock_t rwlock; - int nr_systemwide; - struct list_head perf_events; -}; - static int trace_uprobe_create(int argc, const char **argv); static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev); static int trace_uprobe_release(struct dyn_event *ev); @@ -60,7 +54,6 @@ static struct dyn_event_operations trace_uprobe_ops = { */ struct trace_uprobe { struct dyn_event devent; - struct trace_uprobe_filter filter; struct uprobe_consumer consumer; struct path path; struct inode *inode; @@ -351,7 +344,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (!tu) return ERR_PTR(-ENOMEM); - ret = trace_probe_init(&tu->tp, event, group); + ret = trace_probe_init(&tu->tp, event, group, true); if (ret < 0) goto error; @@ -359,7 +352,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; - init_trace_uprobe_filter(&tu->filter); + init_trace_uprobe_filter(tu->tp.event->filter); return tu; error: @@ -1067,13 +1060,14 @@ static void __probe_event_disable(struct trace_probe *tp) struct trace_probe *pos; struct trace_uprobe *tu; + tu = container_of(tp, struct trace_uprobe, tp); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); if (!tu->inode) continue; - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - uprobe_unregister(tu->inode, tu->offset, &tu->consumer); tu->inode = NULL; } @@ -1108,7 +1102,7 @@ static int probe_event_enable(struct trace_event_call *call, } tu = container_of(tp, struct trace_uprobe, tp); - WARN_ON(!uprobe_filter_is_empty(&tu->filter)); + WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); if (enabled) return 0; @@ -1205,39 +1199,39 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) } static inline bool -uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) +trace_uprobe_filter_event(struct trace_uprobe_filter *filter, + struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); + return __uprobe_perf_filter(filter, event->hw.target->mm); } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static bool trace_uprobe_filter_remove(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { list_del(&event->hw.tp_list); - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || (event->hw.target->flags & PF_EXITING) || - uprobe_filter_event(tu, event); + trace_uprobe_filter_event(filter, event); } else { - tu->filter.nr_systemwide--; - done = tu->filter.nr_systemwide; + filter->nr_systemwide--; + done = filter->nr_systemwide; } - write_unlock(&tu->filter.rwlock); + write_unlock(&filter->rwlock); - if (!done) - return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); - - return 0; + return done; } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +/* This returns true if the filter always covers target mm */ +static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, + struct perf_event *event) { bool done; - int err; - write_lock(&tu->filter.rwlock); + write_lock(&filter->rwlock); if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid @@ -1247,28 +1241,21 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) * attr.enable_on_exec means that exec/mmap will install the * breakpoints we need. */ - done = tu->filter.nr_systemwide || + done = filter->nr_systemwide || event->parent || event->attr.enable_on_exec || - uprobe_filter_event(tu, event); - list_add(&event->hw.tp_list, &tu->filter.perf_events); + trace_uprobe_filter_event(filter, event); + list_add(&event->hw.tp_list, &filter->perf_events); } else { - done = tu->filter.nr_systemwide; - tu->filter.nr_systemwide++; + done = filter->nr_systemwide; + filter->nr_systemwide++; } - write_unlock(&tu->filter.rwlock); + write_unlock(&filter->rwlock); - err = 0; - if (!done) { - err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); - if (err) - uprobe_perf_close(tu, event); - } - return err; + return done; } -static int uprobe_perf_multi_call(struct trace_event_call *call, - struct perf_event *event, - int (*op)(struct trace_uprobe *tu, struct perf_event *event)) +static int uprobe_perf_close(struct trace_event_call *call, + struct perf_event *event) { struct trace_probe *pos, *tp; struct trace_uprobe *tu; @@ -1278,25 +1265,59 @@ static int uprobe_perf_multi_call(struct trace_event_call *call, if (WARN_ON_ONCE(!tp)) return -ENODEV; + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) + return 0; + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { tu = container_of(pos, struct trace_uprobe, tp); - ret = op(tu, event); + ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; } return ret; } + +static int uprobe_perf_open(struct trace_event_call *call, + struct perf_event *event) +{ + struct trace_probe *pos, *tp; + struct trace_uprobe *tu; + int err = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + tu = container_of(tp, struct trace_uprobe, tp); + if (trace_uprobe_filter_add(tu->tp.event->filter, event)) + return 0; + + list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); + if (err) { + uprobe_perf_close(call, event); + break; + } + } + + return err; +} + static bool uprobe_perf_filter(struct uprobe_consumer *uc, enum uprobe_filter_ctx ctx, struct mm_struct *mm) { + struct trace_uprobe_filter *filter; struct trace_uprobe *tu; int ret; tu = container_of(uc, struct trace_uprobe, consumer); - read_lock(&tu->filter.rwlock); - ret = __uprobe_perf_filter(&tu->filter, mm); - read_unlock(&tu->filter.rwlock); + filter = tu->tp.event->filter; + + read_lock(&filter->rwlock); + ret = __uprobe_perf_filter(filter, mm); + read_unlock(&filter->rwlock); return ret; } @@ -1419,10 +1440,10 @@ trace_uprobe_register(struct trace_event_call *event, enum trace_reg type, return 0; case TRACE_REG_PERF_OPEN: - return uprobe_perf_multi_call(event, data, uprobe_perf_open); + return uprobe_perf_open(event, data); case TRACE_REG_PERF_CLOSE: - return uprobe_perf_multi_call(event, data, uprobe_perf_close); + return uprobe_perf_close(event, data); #endif default: diff --git a/kernel/up.c b/kernel/up.c index 862b460ab97a..53144d056252 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -68,9 +68,8 @@ EXPORT_SYMBOL(on_each_cpu_mask); * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. */ -void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags, const struct cpumask *mask) +void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait, const struct cpumask *mask) { unsigned long flags; @@ -84,11 +83,10 @@ void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags) +void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, + void *info, bool wait) { - on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); + on_each_cpu_cond_mask(cond_func, func, info, wait, NULL); } EXPORT_SYMBOL(on_each_cpu_cond); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f41334ef0971..b6b1f54a7837 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -161,6 +161,8 @@ static void lockup_detector_update_enable(void) #ifdef CONFIG_SOFTLOCKUP_DETECTOR +#define SOFTLOCKUP_RESET ULONG_MAX + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -173,8 +175,6 @@ static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); static DEFINE_PER_CPU(bool, softlockup_touch_sync); static DEFINE_PER_CPU(bool, soft_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); -static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); -static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; @@ -274,7 +274,7 @@ notrace void touch_softlockup_watchdog_sched(void) * Preemption can be enabled. It doesn't matter which CPU's timestamp * gets zeroed here, so use the raw_ operation. */ - raw_cpu_write(watchdog_touch_ts, 0); + raw_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); } notrace void touch_softlockup_watchdog(void) @@ -298,14 +298,14 @@ void touch_all_softlockup_watchdogs(void) * the softlockup check. */ for_each_cpu(cpu, &watchdog_allowed_mask) - per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(watchdog_touch_ts, cpu) = SOFTLOCKUP_RESET; wq_watchdog_touch(-1); } void touch_softlockup_watchdog_sync(void) { __this_cpu_write(softlockup_touch_sync, true); - __this_cpu_write(watchdog_touch_ts, 0); + __this_cpu_write(watchdog_touch_ts, SOFTLOCKUP_RESET); } static int is_softlockup(unsigned long touch_ts) @@ -350,8 +350,6 @@ static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work); */ static int softlockup_fn(void *data) { - __this_cpu_write(soft_lockup_hrtimer_cnt, - __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); complete(this_cpu_ptr(&softlockup_completion)); @@ -383,7 +381,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* .. and repeat */ hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - if (touch_ts == 0) { + if (touch_ts == SOFTLOCKUP_RESET) { if (unlikely(__this_cpu_read(softlockup_touch_sync))) { /* * If the time stamp was touched atomically @@ -416,22 +414,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; /* only warn once */ - if (__this_cpu_read(soft_watchdog_warn) == true) { - /* - * When multiple processes are causing softlockups the - * softlockup detector only warns on the first one - * because the code relies on a full quiet cycle to - * re-arm. The second process prevents the quiet cycle - * and never gets reported. Use task pointers to detect - * this. - */ - if (__this_cpu_read(softlockup_task_ptr_saved) != - current) { - __this_cpu_write(soft_watchdog_warn, false); - __touch_watchdog(); - } + if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - } if (softlockup_all_cpu_backtrace) { /* Prevent multiple soft-lockup reports if one cpu is already @@ -447,7 +431,6 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); - __this_cpu_write(softlockup_task_ptr_saved, current); print_modules(); print_irqtrace_events(current); if (regs) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cfc923558e04..4bdfa270a37b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2266,7 +2266,7 @@ __acquires(&pool->lock) * While we must be careful to not use "work" after this, the trace * point will only record its address. */ - trace_workqueue_execute_end(work); + trace_workqueue_execute_end(work, worker->current_func); lock_map_release(&lockdep_map); lock_map_release(&pwq->wq->lockdep_map); |