diff options
Diffstat (limited to 'kernel')
42 files changed, 1067 insertions, 530 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..972d9a8e4ac4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b3746fa..c5b30fd8a315 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,11 +14,15 @@ #include <linux/filter.h> #include <linux/vmalloc.h> +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d50b7..f2ece3c174a5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8485..637397059f76 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7945d10b378..d1d3e8f57de9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 470f6536b9e8..c03a640ef6da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - #include <linux/atomic.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -440,11 +441,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -465,25 +461,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1647,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1717,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1924,6 +1888,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -2004,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2020,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2027,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2146,9 +2114,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2191,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -4062,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4903,11 +4878,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. */ @@ -4918,9 +4893,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4944,6 +4921,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5188,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5346,6 +5327,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5489,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5524,14 +5493,13 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + ret = ss->can_fork(child); if (ret) goto out_revert; } @@ -5543,7 +5511,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5556,15 +5524,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5577,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5622,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child, * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + ss->fork(child); } /** @@ -5822,6 +5788,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2d3df82c54f2..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset) * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task, void *private) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167fda..303097b37429 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeded, otherwise -EAGAIN. + * succeeded, otherwise -EAGAIN. */ static int pids_try_charge(struct pids_cgroup *pids, int num) { @@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task, void **priv_p) +static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; @@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p) return pids_try_charge(pids, 1); } -static void pids_cancel_fork(struct task_struct *task, void *priv) +static void pids_cancel_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..3e945fcd8179 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -51,6 +51,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/time.h> +#include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> @@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ + time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; @@ -1374,7 +1375,7 @@ out: */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ @@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp) /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { - time_t now = get_seconds(); - time_t ticks = now - fmp->time; + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; if (ticks == 0) return; diff --git a/kernel/fork.c b/kernel/fork.c index 291b08cc817b..6774e6b2e96d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p, cgrp_ss_priv); + retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; @@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p, cgrp_ss_priv); + cgroup_post_fork(p); threadgroup_change_end(current); perf_event_fork(p); @@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, cgrp_ss_priv); + cgroup_cancel_fork(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b12a..5797909f4e5b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); @@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_ack(data); } +EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c947..0409da0bcc33 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -159,6 +159,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, owner); @@ -171,6 +172,15 @@ err_desc: return NULL; } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + free_masks(desc); + free_percpu(desc->kstat_irqs); + kfree(desc); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -187,9 +197,12 @@ static void free_desc(unsigned int irq) delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); - free_masks(desc); - free_percpu(desc->kstat_irqs); - kfree(desc); + /* + * We free the descriptor, masks and stat fields via RCU. That + * allows demultiplex interrupts to do rcu based management of + * the child interrupts. + */ + call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 22aa9612ef7c..8cf95de1ab3f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -60,6 +60,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data) fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -70,13 +71,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + if (WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } +EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure @@ -1013,6 +1015,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain @@ -1125,9 +1128,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain, } } -static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) { int ret = 0; struct irq_domain *parent = domain->parent; @@ -1343,6 +1346,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_info - Set the complete data for a @virq in @domain diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6ead200370da..841187239adc 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1743,6 +1743,31 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +/** + * irq_percpu_is_enabled - Check whether the per cpu irq is enabled + * @irq: Linux irq number to check for + * + * Must be called from a non migratable context. Returns the enable + * state of a per cpu interrupt on the current cpu. + */ +bool irq_percpu_is_enabled(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + struct irq_desc *desc; + unsigned long flags; + bool is_enabled; + + desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return false; + + is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + irq_put_desc_unlock(desc, flags); + + return is_enabled; +} +EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); + void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a2a1..15b249e7c673 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -252,6 +252,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, &msi_domain_ops, info); } +int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, arg); + + return ret; +} + +int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, + int virq, int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + struct msi_desc *desc; + int ret = 0; + + for_each_msi_entry(desc, dev) { + /* Don't even try the multi-MSI brain damage. */ + if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { + ret = -EINVAL; + break; + } + + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + ops->set_desc(arg, desc); + /* Assumes the domain mutex is held! */ + ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + if (ret) + break; + + irq_set_msi_desc_off(virq, 0, desc); + } + + if (ret) { + /* Mop up the damage */ + for_each_msi_entry(desc, dev) { + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + irq_domain_free_irqs_common(domain, desc->irq, 1); + } + } + + return ret; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -270,9 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_desc *desc; int i, ret, virq = -1; - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..09c0597840b0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 77d97a6fc715..44253adb3c36 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { sched_move_task(task); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d5ff5c6bf829..b2ab2ffb1adc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0623787ec67a..2c5e3a8e00d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid); cond_syscall(sys_setfsgid); cond_syscall(sys_capget); cond_syscall(sys_capset); +cond_syscall(sys_copy_file_range); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..5faf89ac9ec0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2047,9 +2047,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2077,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2087,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2118,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2310,9 +2302,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2331,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2342,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2370,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2634,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2665,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2681,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7fbba635a549..e840ed867a5d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, MSEC_PER_SEC); return ret; } + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -800,6 +816,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1347882d131e..664de539299b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -218,8 +218,8 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", - cs->name); + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086aea..36f2ca09aa5e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,8 +16,11 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/math64.h> #include "ntp_internal.h" +#include "timekeeping_internal.h" + /* * NTP timekeeping variables: @@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT; static s64 time_freq; /* time at last adjustment (secs): */ -static long time_reftime; +static time64_t time_reftime; static long time_adjust; @@ -297,25 +300,27 @@ static void ntp_update_offset(long offset) if (!(time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; + } /* * Scale the phase adjustment and * clamp to the operating range. */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); + offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = get_seconds() - time_reftime; + secs = (long)(__ktime_get_real_seconds() - time_reftime); if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); @@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(unsigned long secs) +int second_overflow(time64_t secs) { s64 delta; int leap = 0; + s32 rem; /* * Leap second processing. If in leap-insert state at the end of the @@ -404,19 +410,19 @@ int second_overflow(unsigned long secs) case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; - ntp_next_leap_sec = secs + SECS_PER_DAY - - (secs % SECS_PER_DAY); + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (time_status & STA_DEL) { time_state = TIME_DEL; - ntp_next_leap_sec = secs + SECS_PER_DAY - - ((secs+1) % SECS_PER_DAY); + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if (secs % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -427,7 +433,7 @@ int second_overflow(unsigned long secs) if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if ((secs + 1) % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; @@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; @@ -674,8 +680,14 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index af924470eac0..d8a7c11fa71a 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,7 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(unsigned long secs); +extern int second_overflow(time64_t secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ce033c7aa2e8..9cff0ab82b63 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - int result = 0; + unsigned int result = 0; if (!clk) - return -ENODEV; + return POLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 11ce59916c1a..9cc20af58c76 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -603,15 +603,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* * If the tick is due in the next period, keep it ticking or - * restart it proper. + * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ if (!ts->tick_stopped) goto out; + + /* + * If, OTOH, we did stop it, but there's a pending (expired) + * timer reprogram the timer hardware to fire now. + * + * We will not restart the tick proper, just prod the timer + * hardware into firing an interrupt to process the pending + * timers. Just like tick_irq_exit() will not restart the tick + * for 'normal' interrupts. + * + * Only once we exit the idle loop will we re-enable the tick, + * see tick_nohz_idle_exit(). + */ if (delta == 0) { - /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; } @@ -701,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d563c1960302..34b4cedfa80d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -305,8 +305,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) delta = timekeeping_get_delta(tkr); - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; + nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -846,6 +845,19 @@ time64_t ktime_get_real_seconds(void) } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds + * but without the sequence counter protect. This internal function + * is called just when timekeeping lock is already held. + */ +time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} + + #ifdef CONFIG_NTP_PPS /** @@ -959,7 +971,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); @@ -1592,9 +1604,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, { s64 interval = tk->cycle_interval; s64 xinterval = tk->xtime_interval; + u32 base = tk->tkr_mono.clock->mult; + u32 max = tk->tkr_mono.clock->maxadj; + u32 cur_adj = tk->tkr_mono.mult; s64 tick_error; bool negative; - u32 adj; + u32 adj_scale; /* Remove any current error adj from freq calculation */ if (tk->ntp_err_mult) @@ -1613,13 +1628,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, /* preserve the direction of correction */ negative = (tick_error < 0); - /* Sort out the magnitude of the correction */ + /* If any adjustment would pass the max, just return */ + if (negative && (cur_adj - 1) <= (base - max)) + return; + if (!negative && (cur_adj + 1) >= (base + max)) + return; + /* + * Sort out the magnitude of the correction, but + * avoid making so large a correction that we go + * over the max adjustment. + */ + adj_scale = 0; tick_error = abs(tick_error); - for (adj = 0; tick_error > interval; adj++) + while (tick_error > interval) { + u32 adj = 1 << (adj_scale + 1); + + /* Check if adjustment gets us within 1 unit from the max */ + if (negative && (cur_adj - adj) <= (base - max)) + break; + if (!negative && (cur_adj + adj) >= (base + max)) + break; + + adj_scale++; tick_error >>= 1; + } /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj); + timekeeping_apply_adjustment(tk, offset, negative, adj_scale); } /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ea005a7f9da..5be76270ec4a 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) { cycle_t ret = (now - last) & mask; - return (s64) ret > 0 ? ret : 0; + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ? 0 : ret; } #else static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) @@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) } #endif +extern time64_t __ktime_get_real_seconds(void); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8604..2aeb6ffc0a1e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); - msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4228fd3682c3..45dd798bcd37 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -316,7 +316,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static struct bpf_verifier_ops kprobe_prog_ops = { +static const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b147247..eca592f977b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) - #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ @@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; -static struct ftrace_ops control_ops; - -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -203,7 +196,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void control_ops_disable_all(struct ftrace_ops *ops) +static void per_cpu_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops) *per_cpu_ptr(ops->disabled, cpu) = 1; } -static int control_ops_alloc(struct ftrace_ops *ops) +static int per_cpu_ops_alloc(struct ftrace_ops *ops) { int __percpu *disabled; + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) + return -EINVAL; + disabled = alloc_percpu(int); if (!disabled) return -ENOMEM; ops->disabled = disabled; - control_ops_disable_all(ops); + per_cpu_ops_disable_all(ops); return 0; } @@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { } static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic ops or we force list func, + * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | + FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static void add_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int first = *list == &ftrace_list_end; - add_ftrace_ops(list, ops); - if (first) - add_ftrace_ops(&ftrace_ops_list, main_ops); -} - -static int remove_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int ret = remove_ftrace_ops(list, ops); - if (!ret && *list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); - return ret; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops); static int __register_ftrace_function(struct ftrace_ops *ops) @@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - if (control_ops_alloc(ops)) + if (ops->flags & FTRACE_OPS_FL_PER_CPU) { + if (per_cpu_ops_alloc(ops)) return -ENOMEM; - add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - /* The control_ops needs the trampoline update */ - ops = &control_ops; - } else - add_ftrace_ops(&ftrace_ops_list, ops); + } + + add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; @@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - ret = remove_ftrace_list_ops(&ftrace_control_list, - &control_ops, ops); - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); + ret = remove_ftrace_ops(&ftrace_ops_list, ops); if (ret < 0) return ret; @@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (all) { /* * Only the filter_hash affects all records. @@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); } -static void print_ip_ins(const char *fmt, unsigned char *p) +static void print_ip_ins(const char *fmt, const unsigned char *p) { int i; @@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} /** * ftrace_bug - report and shutdown function tracer @@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + print_bug_type(); if (rec) { struct ftrace_ops *ops = NULL; @@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) rec->flags & FTRACE_FL_REGS ? " R" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - pr_cont("\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else pr_cont("\ttramp: ERROR!"); } ip = ftrace_get_addr_curr(rec); - pr_cont(" expected tramp: %lx\n", ip); + pr_cont("\n expected tramp: %lx\n", ip); } } @@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (rec->flags & FTRACE_FL_DISABLED) + return FTRACE_UPDATE_IGNORE; + /* * If we are updating calls: * @@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * from the save regs, to a non-save regs function or * vice versa, or from a trampoline call. */ - if (flag & FTRACE_FL_ENABLED) + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; return FTRACE_UPDATE_MAKE_CALL; + } + ftrace_bug_type = FTRACE_BUG_UPDATE; return FTRACE_UPDATE_MODIFY_CALL; } @@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) FTRACE_FL_REGS_EN); } + ftrace_bug_type = FTRACE_BUG_NOP; return FTRACE_UPDATE_MAKE_NOP; } @@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; @@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); return 0; } @@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void control_ops_free(struct ftrace_ops *ops) +static void per_cpu_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); } @@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their + * If these are per_cpu ops, they still need their * per_cpu field freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); return 0; } @@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the control + * The same goes for freeing the per_cpu data of the per_cpu * ops. * * Again, normal synchronize_sched() is not good enough. @@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); arch_ftrace_trampoline_free(ops); - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); } return 0; @@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - /* If ops traces all mods, we already accounted for it */ + /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 0; + return 1; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && @@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 1; } -static int referenced_filters(struct dyn_ftrace *rec) -{ - struct ftrace_ops *ops; - int cnt = 0; - - for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; - } - - return cnt; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long update_cnt = 0; - unsigned long ref = 0; - bool test = false; + unsigned long rec_flags = 0; int i; + start = ftrace_now(raw_smp_processor_id()); + /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - if (ops_traces_mod(ops)) - ref++; - else - test = true; - } - } - } - - start = ftrace_now(raw_smp_processor_id()); + if (mod) + rec_flags |= FTRACE_FL_DISABLED; for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { - int cnt = ref; /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - if (test) - cnt += referenced_filters(p); - p->flags = cnt; + p->flags = rec_flags; /* * Do the initial record conversion from mcount jump @@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) break; update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p); - } } } @@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - struct ftrace_ops *ops = NULL; + struct ftrace_ops *ops; seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), @@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v) rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - seq_printf(m, "\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else seq_puts(m, "\ttramp: ERROR!"); - + } else { + add_trampoline_func(m, NULL, rec); } - add_trampoline_func(m, ops, rec); } seq_putc(m, '\n'); @@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod, #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; @@ -4940,41 +4961,112 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +static void ftrace_module_enable(struct module *mod) { - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); + + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module_core(rec->ip, mod)) + break; + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. + */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + /* This clears FTRACE_FL_DISABLED */ + rec->flags = cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); } void ftrace_module_init(struct module *mod) { - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); } -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - if (val == MODULE_STATE_GOING) + switch (val) { + case MODULE_STATE_COMING: + ftrace_module_enable(mod); + break; + case MODULE_STATE_GOING: ftrace_release_mod(mod); + break; + default: + break; + } return 0; } #else -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_exit_nb = { - .notifier_call = ftrace_module_notify_exit, +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; @@ -5006,7 +5098,7 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_exit_nb); + ret = register_module_notifier(&ftrace_module_nb); if (ret) pr_warning("Failed to register trace ftrace module exit notifier\n"); @@ -5116,44 +5208,6 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) - return; - - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - trace_recursion_set(TRACE_CONTROL_BIT); - - /* - * Control funcs (perf) uses RCU. Only trace if - * RCU is currently active. - */ - if (!rcu_is_watching()) - goto out; - - do_for_each_ftrace_op(op, ftrace_control_list) { - if (!(op->flags & FTRACE_OPS_FL_STUB) && - !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip, regs)) - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - out: - trace_recursion_clear(TRACE_CONTROL_BIT); - preempt_enable_notrace(); -} - -static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(control_ops) -}; - static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) @@ -5170,8 +5224,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) { + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * if PER_CPU is set, then ftrace_function_local_disable() + * must be false + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -5195,7 +5263,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -5212,20 +5280,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) /* * If there's only one function registered but it does not support - * recursion, this function will be called by the mcount trampoline. - * This function will handle recursion protection. + * recursion, needs RCU protection and/or requires per cpu handling, then + * this function will be called by the mcount trampoline. */ -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { int bit; + if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) + return; + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - op->func(ip, parent_ip, op, regs); + preempt_disable_notrace(); + if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) { + op->func(ip, parent_ip, op, regs); + } + + preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5243,12 +5320,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. + * If the function does not handle recursion, needs to be RCU safe, + * or does per cpu logic, then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - return ftrace_ops_recurs_func; + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || + ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + return ftrace_ops_assist_func; return ops->func; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..95181e36891a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old, /* * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { - struct buffer_page *old_tail; unsigned long old_entries; unsigned long old_write; - int ret = 0; /* * The tail page now needs to be moved forward. @@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * it is, then it is up to us to update the tail * pointer. */ - if (tail_page == cpu_buffer->tail_page) { + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; @@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); } - - return ret; } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, @@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); @@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; - u64 ts; next_page = tail_page; @@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, } } - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } + rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now. */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, addr = (unsigned long)event; addr &= PAGE_MASK; - bpage = cpu_buffer->tail_page; + bpage = READ_ONCE(cpu_buffer->tail_page); if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = @@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) again: max_count = cpu_buffer->nr_pages * 100; - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, @@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; + /* Only update the write stamp if the page has an event */ + if (rb_page_write(cpu_buffer->commit_page)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } @@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - tail_page = info->tail_page = cpu_buffer->tail_page; + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d07686f..8414fa40bf27 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -363,8 +363,8 @@ struct trace_option_dentry { * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer - * @start: called when tracing is unpaused (echo 1 > tracing_enabled) - * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened @@ -467,8 +467,6 @@ enum { TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, - TRACE_CONTROL_BIT, - TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a9319be..00df25fd86ef 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; return register_ftrace_function(ops); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4f6ef6912e00..f333e57c4614 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1340,15 +1340,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); @@ -1356,7 +1350,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1507,18 +1501,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 42a4009fd75a..b38f617b6181 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long)buf); - return -EFAULT; - } - buf[cnt] = '\0'; strim(buf); mutex_lock(&event_mutex); event_file = event_file_data(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); return -ENODEV; } ret = trigger_process_regex(event_file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); if (ret < 0) goto out; @@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } @@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob, (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c516..9bafc211930c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; - char *kbuf, *pos, *next_line; + char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; /* @@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); + kbuf = NULL; goto out; - kbuf[count] = '\0'; + } /* Parse the user data */ ret = -EINVAL; @@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = count; out: mutex_unlock(&userns_state_mutex); - if (page) - free_page(page); + kfree(kbuf); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84b5035cb6a5..b3ace6ebbba3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c579dbab2e36..61a0264e28f9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1385,6 +1389,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2157,6 +2163,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2240,6 +2248,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2256,9 +2265,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2316,6 +2330,37 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4308,7 +4355,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. + */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5290,6 +5487,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues); |