diff options
Diffstat (limited to 'kernel')
114 files changed, 10988 insertions, 1549 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474..04bc07c2b42a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += dma/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -112,7 +113,9 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o +obj-$(CONFIG_RSEQ) += rseq.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/audit.c b/kernel/audit.c index 670665c6e2a6..e7478cb58079 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1099,8 +1099,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature if (audit_enabled == AUDIT_OFF) return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE); if (!ab) return; audit_log_task_info(ab, current); @@ -2317,8 +2316,7 @@ void audit_log_link_denied(const char *operation) return; /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(current->audit_context, GFP_KERNEL, - AUDIT_ANOM_LINK); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); if (!ab) return; audit_log_format(ab, "op=%s", operation); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 52f368b6561e..fba78047fb37 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -109,7 +109,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true); + ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true); if (ret < 0) { fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); @@ -165,12 +165,11 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) /* Update mark data in audit rules based on fsnotify events. */ static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct audit_fsnotify_mark *audit_mark; const struct inode *inode = NULL; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67e6956c0b61..c99ebaae5abc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -288,8 +288,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode, - NULL, 1)) { + if (fsnotify_add_inode_mark_locked(&new->mark, entry->connector->inode, + 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -354,7 +354,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; entry = &chunk->mark; - if (fsnotify_add_mark(entry, inode, NULL, 0)) { + if (fsnotify_add_inode_mark(entry, inode, 0)) { fsnotify_put_mark(entry); return -ENOSPC; } @@ -434,8 +434,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return -ENOENT; } - if (fsnotify_add_mark_locked(chunk_entry, - old_entry->connector->inode, NULL, 1)) { + if (fsnotify_add_inode_mark_locked(chunk_entry, + old_entry->connector->inode, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); @@ -989,8 +989,6 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 9eb8b3511636..c17c0c268436 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -160,7 +160,7 @@ static struct audit_parent *audit_init_parent(struct path *path) fsnotify_init_mark(&parent->mark, audit_watch_group); parent->mark.mask = AUDIT_FS_WATCH; - ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0); + ret = fsnotify_add_inode_mark(&parent->mark, inode, 0); if (ret < 0) { audit_free_parent(parent); return ERR_PTR(ret); @@ -274,7 +274,7 @@ static void audit_update_watch(struct audit_parent *parent, /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ if (invalidating && !audit_dummy_context()) - audit_filter_inodes(current, current->audit_context); + audit_filter_inodes(current, audit_context()); /* updating ino will likely change which audit_hash_list we * are on so we need a new watch for the new list */ @@ -472,12 +472,11 @@ void audit_remove_watch_rule(struct audit_krule *krule) /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, const unsigned char *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { + struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); const struct inode *inode; struct audit_parent *parent; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d7a807e81451..eaa320148d97 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -426,7 +426,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return -EINVAL; break; case AUDIT_EXE: - if (f->op != Audit_equal) + if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; if (entry->rule.listnr != AUDIT_FILTER_EXIT) return -EINVAL; @@ -1089,8 +1089,6 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) static void audit_log_rule_change(char *action, struct audit_krule *rule, int res) { struct audit_buffer *ab; - uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -1098,7 +1096,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid); + audit_log_session_info(ab); audit_log_task_context(ab); audit_log_format(ab, " op=%s", action); audit_log_key(ab, rule->filterkey); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4e0a4ac803db..ceb1c4596c51 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -374,7 +374,7 @@ static int audit_field_compare(struct task_struct *tsk, case AUDIT_COMPARE_EGID_TO_OBJ_GID: return audit_compare_gid(cred->egid, name, f, ctx); case AUDIT_COMPARE_AUID_TO_OBJ_UID: - return audit_compare_uid(tsk->loginuid, name, f, ctx); + return audit_compare_uid(audit_get_loginuid(tsk), name, f, ctx); case AUDIT_COMPARE_SUID_TO_OBJ_UID: return audit_compare_uid(cred->suid, name, f, ctx); case AUDIT_COMPARE_SGID_TO_OBJ_GID: @@ -385,7 +385,8 @@ static int audit_field_compare(struct task_struct *tsk, return audit_compare_gid(cred->fsgid, name, f, ctx); /* uid comparisons */ case AUDIT_COMPARE_UID_TO_AUID: - return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); + return audit_uid_comparator(cred->uid, f->op, + audit_get_loginuid(tsk)); case AUDIT_COMPARE_UID_TO_EUID: return audit_uid_comparator(cred->uid, f->op, cred->euid); case AUDIT_COMPARE_UID_TO_SUID: @@ -394,11 +395,14 @@ static int audit_field_compare(struct task_struct *tsk, return audit_uid_comparator(cred->uid, f->op, cred->fsuid); /* auid comparisons */ case AUDIT_COMPARE_AUID_TO_EUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->euid); case AUDIT_COMPARE_AUID_TO_SUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->suid); case AUDIT_COMPARE_AUID_TO_FSUID: - return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); + return audit_uid_comparator(audit_get_loginuid(tsk), f->op, + cred->fsuid); /* euid comparisons */ case AUDIT_COMPARE_EUID_TO_SUID: return audit_uid_comparator(cred->euid, f->op, cred->suid); @@ -471,6 +475,8 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_EXE: result = audit_exe_compare(tsk, rule->exe); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_UID: result = audit_uid_comparator(cred->uid, f->op, f->uid); @@ -511,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_gid_comparator(cred->fsgid, f->op, f->gid); break; case AUDIT_SESSIONID: - sessionid = audit_get_sessionid(current); + sessionid = audit_get_sessionid(tsk); result = audit_comparator(sessionid, f->op, f->val); break; case AUDIT_PERS: @@ -609,7 +615,8 @@ static int audit_filter_rules(struct task_struct *tsk, result = match_tree_refs(ctx, rule->tree); break; case AUDIT_LOGINUID: - result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); + result = audit_uid_comparator(audit_get_loginuid(tsk), + f->op, f->uid); break; case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); @@ -863,7 +870,7 @@ static inline struct audit_context *audit_take_context(struct task_struct *tsk, audit_filter_inodes(tsk, context); } - tsk->audit_context = NULL; + audit_set_context(tsk, NULL); return context; } @@ -950,7 +957,7 @@ int audit_alloc(struct task_struct *tsk) } context->filterkey = key; - tsk->audit_context = context; + audit_set_context(tsk, context); set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); return 0; } @@ -1507,8 +1514,7 @@ void __audit_free(struct task_struct *tsk) void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { - struct task_struct *tsk = current; - struct audit_context *context = tsk->audit_context; + struct audit_context *context = audit_context(); enum audit_state state; if (!audit_enabled || !context) @@ -1523,7 +1529,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, context->dummy = !audit_n_rules; if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { context->prio = 0; - if (auditd_test_task(tsk)) + if (auditd_test_task(current)) return; } @@ -1553,7 +1559,6 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, */ void __audit_syscall_exit(int success, long return_code) { - struct task_struct *tsk = current; struct audit_context *context; if (success) @@ -1561,12 +1566,12 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_take_context(tsk, success, return_code); + context = audit_take_context(current, success, return_code); if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) - audit_log_exit(context, tsk); + audit_log_exit(context, current); context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; @@ -1588,7 +1593,7 @@ void __audit_syscall_exit(int success, long return_code) kfree(context->filterkey); context->filterkey = NULL; } - tsk->audit_context = context; + audit_set_context(current, context); } static inline void handle_one(const struct inode *inode) @@ -1600,7 +1605,7 @@ static inline void handle_one(const struct inode *inode) int count; if (likely(!inode->i_fsnotify_marks)) return; - context = current->audit_context; + context = audit_context(); p = context->trees; count = context->tree_count; rcu_read_lock(); @@ -1631,7 +1636,7 @@ static void handle_path(const struct dentry *dentry) unsigned long seq; int count; - context = current->audit_context; + context = audit_context(); p = context->trees; count = context->tree_count; retry: @@ -1713,7 +1718,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, struct filename * __audit_reusename(const __user char *uptr) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct audit_names *n; list_for_each_entry(n, &context->names_list, list) { @@ -1736,7 +1741,7 @@ __audit_reusename(const __user char *uptr) */ void __audit_getname(struct filename *name) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct audit_names *n; if (!context->in_syscall) @@ -1764,7 +1769,7 @@ void __audit_getname(struct filename *name) void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1863,7 +1868,7 @@ void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; @@ -2048,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - unsigned int oldsessionid, sessionid = (unsigned int)-1; + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; kuid_t oldloginuid; int rc; @@ -2062,7 +2067,7 @@ int audit_set_loginuid(kuid_t loginuid) /* are we setting or clearing? */ if (uid_valid(loginuid)) { sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == (unsigned int)-1)) + if (unlikely(sessionid == AUDIT_SID_UNSET)) sessionid = (unsigned int)atomic_inc_return(&session_id); } @@ -2082,7 +2087,7 @@ out: */ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (attr) memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr)); @@ -2106,7 +2111,7 @@ void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct timespec64 *p = &context->mq_sendrecv.abs_timeout; if (abs_timeout) @@ -2130,7 +2135,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (notification) context->mq_notify.sigev_signo = notification->sigev_signo; @@ -2149,7 +2154,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) */ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->mq_getsetattr.mqdes = mqdes; context->mq_getsetattr.mqstat = *mqstat; context->type = AUDIT_MQ_GETSETATTR; @@ -2162,7 +2167,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) */ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->ipc.uid = ipcp->uid; context->ipc.gid = ipcp->gid; context->ipc.mode = ipcp->mode; @@ -2182,7 +2187,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) */ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->ipc.qbytes = qbytes; context->ipc.perm_uid = uid; @@ -2193,7 +2198,7 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo void __audit_bprm(struct linux_binprm *bprm) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->type = AUDIT_EXECVE; context->execve.argc = bprm->argc; @@ -2208,7 +2213,7 @@ void __audit_bprm(struct linux_binprm *bprm) */ int __audit_socketcall(int nargs, unsigned long *args) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (nargs <= 0 || nargs > AUDITSC_ARGS || !args) return -EINVAL; @@ -2226,7 +2231,7 @@ int __audit_socketcall(int nargs, unsigned long *args) */ void __audit_fd_pair(int fd1, int fd2) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->fds[0] = fd1; context->fds[1] = fd2; } @@ -2240,7 +2245,7 @@ void __audit_fd_pair(int fd1, int fd2) */ int __audit_sockaddr(int len, void *a) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); @@ -2256,7 +2261,7 @@ int __audit_sockaddr(int len, void *a) void __audit_ptrace(struct task_struct *t) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->target_pid = task_tgid_nr(t); context->target_auid = audit_get_loginuid(t); @@ -2277,19 +2282,19 @@ void __audit_ptrace(struct task_struct *t) int audit_signal_info(int sig, struct task_struct *t) { struct audit_aux_data_pids *axp; - struct task_struct *tsk = current; - struct audit_context *ctx = tsk->audit_context; - kuid_t uid = current_uid(), t_uid = task_uid(t); + struct audit_context *ctx = audit_context(); + kuid_t uid = current_uid(), auid, t_uid = task_uid(t); if (auditd_test_task(t) && (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2)) { - audit_sig_pid = task_tgid_nr(tsk); - if (uid_valid(tsk->loginuid)) - audit_sig_uid = tsk->loginuid; + audit_sig_pid = task_tgid_nr(current); + auid = audit_get_loginuid(current); + if (uid_valid(auid)) + audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid(tsk, &audit_sig_sid); + security_task_getsecid(current, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) @@ -2345,7 +2350,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { struct audit_aux_data_bprm_fcaps *ax; - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); struct cpu_vfs_cap_data vcaps; ax = kmalloc(sizeof(*ax), GFP_KERNEL); @@ -2385,7 +2390,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, */ void __audit_log_capset(const struct cred *new, const struct cred *old) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->capset.pid = task_tgid_nr(current); context->capset.cap.effective = new->cap_effective; context->capset.cap.inheritable = new->cap_effective; @@ -2396,7 +2401,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old) void __audit_mmap_fd(int fd, int flags) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->mmap.fd = fd; context->mmap.flags = flags; context->type = AUDIT_MMAP; @@ -2404,7 +2409,7 @@ void __audit_mmap_fd(int fd, int flags) void __audit_log_kern_module(char *name) { - struct audit_context *context = current->audit_context; + struct audit_context *context = audit_context(); context->module.name = kmalloc(strlen(name) + 1, GFP_KERNEL); strcpy(context->module.name, name); @@ -2413,7 +2418,7 @@ void __audit_log_kern_module(char *name) void __audit_fanotify(unsigned int response) { - audit_log(current->audit_context, GFP_KERNEL, + audit_log(audit_context(), GFP_KERNEL, AUDIT_FANOTIFY, "resp=%u", response); } @@ -2464,7 +2469,19 @@ void audit_core_dumps(long signr) audit_log_end(ab); } -void __audit_seccomp(unsigned long syscall, long signr, int code) +/** + * audit_seccomp - record information about a seccomp action + * @syscall: syscall number + * @signr: signal value + * @code: the seccomp action + * + * Record the information associated with a seccomp action. Event filtering for + * seccomp actions that are not to be logged is done in seccomp_log(). + * Therefore, this function forces auditing independent of the audit_enabled + * and dummy context state because seccomp actions should be logged even when + * audit is not in use. + */ +void audit_seccomp(unsigned long syscall, long signr, int code) { struct audit_buffer *ab; @@ -2478,9 +2495,29 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) audit_log_end(ab); } +void audit_seccomp_actions_logged(const char *names, const char *old_names, + int res) +{ + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, + AUDIT_CONFIG_CHANGE); + if (unlikely(!ab)) + return; + + audit_log_format(ab, "op=seccomp-logging"); + audit_log_format(ab, " actions=%s", names); + audit_log_format(ab, " old-actions=%s", old_names); + audit_log_format(ab, " res=%d", res); + audit_log_end(ab); +} + struct list_head *audit_killed_trees(void) { - struct audit_context *ctx = current->audit_context; + struct audit_context *ctx = audit_context(); if (likely(!ctx || !ctx->in_syscall)) return NULL; return &ctx->killed_trees; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a713fd23ec88..f27f5496d6fe 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -4,9 +4,13 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o +ifeq ($(CONFIG_XDP_SOCKETS),y) +obj-$(CONFIG_BPF_SYSCALL) += xskmap.o +endif obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 027107f4be53..544e58f5f642 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -11,11 +11,13 @@ * General Public License for more details. */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/filter.h> #include <linux/perf_event.h> +#include <uapi/linux/btf.h> #include "map_in_map.h" @@ -336,6 +338,52 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } +static void array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void *value; + + rcu_read_lock(); + + value = array_map_lookup_elem(map, key); + if (!value) { + rcu_read_unlock(); + return; + } + + seq_printf(m, "%u: ", *(u32 *)key); + btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); + seq_puts(m, "\n"); + + rcu_read_unlock(); +} + +static int array_map_check_btf(const struct bpf_map *map, const struct btf *btf, + u32 btf_key_id, u32 btf_value_id) +{ + const struct btf_type *key_type, *value_type; + u32 key_size, value_size; + u32 int_data; + + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + /* bpf array can only take a u32 key. This check makes + * sure that the btf matches the attr used during map_create. + */ + if (BTF_INT_BITS(int_data) != 32 || key_size != 4 || + BTF_INT_OFFSET(int_data)) + return -EINVAL; + + value_type = btf_type_id_size(btf, &btf_value_id, &value_size); + if (!value_type || value_size > map->value_size) + return -EINVAL; + + return 0; +} + const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, @@ -345,6 +393,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_seq_show_elem = array_map_seq_show_elem, + .map_check_btf = array_map_check_btf, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c new file mode 100644 index 000000000000..2d49d18b793a --- /dev/null +++ b/kernel/bpf/btf.c @@ -0,0 +1,2348 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2018 Facebook */ + +#include <uapi/linux/btf.h> +#include <uapi/linux/types.h> +#include <linux/seq_file.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/idr.h> +#include <linux/sort.h> +#include <linux/bpf_verifier.h> +#include <linux/btf.h> + +/* BTF (BPF Type Format) is the meta data format which describes + * the data types of BPF program/map. Hence, it basically focus + * on the C programming language which the modern BPF is primary + * using. + * + * ELF Section: + * ~~~~~~~~~~~ + * The BTF data is stored under the ".BTF" ELF section + * + * struct btf_type: + * ~~~~~~~~~~~~~~~ + * Each 'struct btf_type' object describes a C data type. + * Depending on the type it is describing, a 'struct btf_type' + * object may be followed by more data. F.e. + * To describe an array, 'struct btf_type' is followed by + * 'struct btf_array'. + * + * 'struct btf_type' and any extra data following it are + * 4 bytes aligned. + * + * Type section: + * ~~~~~~~~~~~~~ + * The BTF type section contains a list of 'struct btf_type' objects. + * Each one describes a C type. Recall from the above section + * that a 'struct btf_type' object could be immediately followed by extra + * data in order to desribe some particular C types. + * + * type_id: + * ~~~~~~~ + * Each btf_type object is identified by a type_id. The type_id + * is implicitly implied by the location of the btf_type object in + * the BTF type section. The first one has type_id 1. The second + * one has type_id 2...etc. Hence, an earlier btf_type has + * a smaller type_id. + * + * A btf_type object may refer to another btf_type object by using + * type_id (i.e. the "type" in the "struct btf_type"). + * + * NOTE that we cannot assume any reference-order. + * A btf_type object can refer to an earlier btf_type object + * but it can also refer to a later btf_type object. + * + * For example, to describe "const void *". A btf_type + * object describing "const" may refer to another btf_type + * object describing "void *". This type-reference is done + * by specifying type_id: + * + * [1] CONST (anon) type_id=2 + * [2] PTR (anon) type_id=0 + * + * The above is the btf_verifier debug log: + * - Each line started with "[?]" is a btf_type object + * - [?] is the type_id of the btf_type object. + * - CONST/PTR is the BTF_KIND_XXX + * - "(anon)" is the name of the type. It just + * happens that CONST and PTR has no name. + * - type_id=XXX is the 'u32 type' in btf_type + * + * NOTE: "void" has type_id 0 + * + * String section: + * ~~~~~~~~~~~~~~ + * The BTF string section contains the names used by the type section. + * Each string is referred by an "offset" from the beginning of the + * string section. + * + * Each string is '\0' terminated. + * + * The first character in the string section must be '\0' + * which is used to mean 'anonymous'. Some btf_type may not + * have a name. + */ + +/* BTF verification: + * + * To verify BTF data, two passes are needed. + * + * Pass #1 + * ~~~~~~~ + * The first pass is to collect all btf_type objects to + * an array: "btf->types". + * + * Depending on the C type that a btf_type is describing, + * a btf_type may be followed by extra data. We don't know + * how many btf_type is there, and more importantly we don't + * know where each btf_type is located in the type section. + * + * Without knowing the location of each type_id, most verifications + * cannot be done. e.g. an earlier btf_type may refer to a later + * btf_type (recall the "const void *" above), so we cannot + * check this type-reference in the first pass. + * + * In the first pass, it still does some verifications (e.g. + * checking the name is a valid offset to the string section). + * + * Pass #2 + * ~~~~~~~ + * The main focus is to resolve a btf_type that is referring + * to another type. + * + * We have to ensure the referring type: + * 1) does exist in the BTF (i.e. in btf->types[]) + * 2) does not cause a loop: + * struct A { + * struct B b; + * }; + * + * struct B { + * struct A a; + * }; + * + * btf_type_needs_resolve() decides if a btf_type needs + * to be resolved. + * + * The needs_resolve type implements the "resolve()" ops which + * essentially does a DFS and detects backedge. + * + * During resolve (or DFS), different C types have different + * "RESOLVED" conditions. + * + * When resolving a BTF_KIND_STRUCT, we need to resolve all its + * members because a member is always referring to another + * type. A struct's member can be treated as "RESOLVED" if + * it is referring to a BTF_KIND_PTR. Otherwise, the + * following valid C struct would be rejected: + * + * struct A { + * int m; + * struct A *a; + * }; + * + * When resolving a BTF_KIND_PTR, it needs to keep resolving if + * it is referring to another BTF_KIND_PTR. Otherwise, we cannot + * detect a pointer loop, e.g.: + * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + + * ^ | + * +-----------------------------------------+ + * + */ + +#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +#define BITS_ROUNDUP_BYTES(bits) \ + (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) + +#define BTF_INFO_MASK 0x0f00ffff +#define BTF_INT_MASK 0x0fffffff +#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) + +/* 16MB for 64k structs and each has 16 members and + * a few MB spaces for the string section. + * The hard limit is S32_MAX. + */ +#define BTF_MAX_SIZE (16 * 1024 * 1024) + +#define for_each_member(i, struct_type, member) \ + for (i = 0, member = btf_type_member(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_member_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_member(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +static DEFINE_IDR(btf_idr); +static DEFINE_SPINLOCK(btf_idr_lock); + +struct btf { + void *data; + struct btf_type **types; + u32 *resolved_ids; + u32 *resolved_sizes; + const char *strings; + void *nohdr_data; + struct btf_header hdr; + u32 nr_types; + u32 types_size; + u32 data_size; + refcount_t refcnt; + u32 id; + struct rcu_head rcu; +}; + +enum verifier_phase { + CHECK_META, + CHECK_TYPE, +}; + +struct resolve_vertex { + const struct btf_type *t; + u32 type_id; + u16 next_member; +}; + +enum visit_state { + NOT_VISITED, + VISITED, + RESOLVED, +}; + +enum resolve_mode { + RESOLVE_TBD, /* To Be Determined */ + RESOLVE_PTR, /* Resolving for Pointer */ + RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union + * or array + */ +}; + +#define MAX_RESOLVE_DEPTH 32 + +struct btf_sec_info { + u32 off; + u32 len; +}; + +struct btf_verifier_env { + struct btf *btf; + u8 *visit_states; + struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; + struct bpf_verifier_log log; + u32 log_type_id; + u32 top_stack; + enum verifier_phase phase; + enum resolve_mode resolve_mode; +}; + +static const char * const btf_kind_str[NR_BTF_KINDS] = { + [BTF_KIND_UNKN] = "UNKNOWN", + [BTF_KIND_INT] = "INT", + [BTF_KIND_PTR] = "PTR", + [BTF_KIND_ARRAY] = "ARRAY", + [BTF_KIND_STRUCT] = "STRUCT", + [BTF_KIND_UNION] = "UNION", + [BTF_KIND_ENUM] = "ENUM", + [BTF_KIND_FWD] = "FWD", + [BTF_KIND_TYPEDEF] = "TYPEDEF", + [BTF_KIND_VOLATILE] = "VOLATILE", + [BTF_KIND_CONST] = "CONST", + [BTF_KIND_RESTRICT] = "RESTRICT", +}; + +struct btf_kind_operations { + s32 (*check_meta)(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left); + int (*resolve)(struct btf_verifier_env *env, + const struct resolve_vertex *v); + int (*check_member)(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type); + void (*log_details)(struct btf_verifier_env *env, + const struct btf_type *t); + void (*seq_show)(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m); +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +static struct btf_type btf_void; + +static bool btf_type_is_modifier(const struct btf_type *t) +{ + /* Some of them is not strictly a C modifier + * but they are grouped into the same bucket + * for BTF concern: + * A type (t) that refers to another + * type through t->type AND its size cannot + * be determined without following the t->type. + * + * ptr does not fall into this bucket + * because its size is always sizeof(void *). + */ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + return true; + } + + return false; +} + +static bool btf_type_is_void(const struct btf_type *t) +{ + /* void => no type and size info. + * Hence, FWD is also treated as void. + */ + return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +} + +static bool btf_type_is_void_or_null(const struct btf_type *t) +{ + return !t || btf_type_is_void(t); +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + +static bool btf_type_is_ptr(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +} + +static bool btf_type_is_int(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +} + +/* What types need to be resolved? + * + * btf_type_is_modifier() is an obvious one. + * + * btf_type_is_struct() because its member refers to + * another type (through member->type). + + * btf_type_is_array() because its element (array->type) + * refers to another type. Array can be thought of a + * special case of struct while array just has the same + * member-type repeated by array->nelems of times. + */ +static bool btf_type_needs_resolve(const struct btf_type *t) +{ + return btf_type_is_modifier(t) || + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t); +} + +/* t->size can be used */ +static bool btf_type_has_size(const struct btf_type *t) +{ + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + return true; + } + + return false; +} + +static const char *btf_int_encoding_str(u8 encoding) +{ + if (encoding == 0) + return "(none)"; + else if (encoding == BTF_INT_SIGNED) + return "SIGNED"; + else if (encoding == BTF_INT_CHAR) + return "CHAR"; + else if (encoding == BTF_INT_BOOL) + return "BOOL"; + else + return "UNKN"; +} + +static u16 btf_type_vlen(const struct btf_type *t) +{ + return BTF_INFO_VLEN(t->info); +} + +static u32 btf_type_int(const struct btf_type *t) +{ + return *(u32 *)(t + 1); +} + +static const struct btf_array *btf_type_array(const struct btf_type *t) +{ + return (const struct btf_array *)(t + 1); +} + +static const struct btf_member *btf_type_member(const struct btf_type *t) +{ + return (const struct btf_member *)(t + 1); +} + +static const struct btf_enum *btf_type_enum(const struct btf_type *t) +{ + return (const struct btf_enum *)(t + 1); +} + +static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +{ + return kind_ops[BTF_INFO_KIND(t->info)]; +} + +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +{ + return BTF_STR_OFFSET_VALID(offset) && + offset < btf->hdr.str_len; +} + +static const char *btf_name_by_offset(const struct btf *btf, u32 offset) +{ + if (!offset) + return "(anon)"; + else if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + else + return "(invalid-name-offset)"; +} + +static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +{ + if (type_id > btf->nr_types) + return NULL; + + return btf->types[type_id]; +} + +/* + * Regular int is not a bit field and it must be either + * u8/u16/u32/u64. + */ +static bool btf_type_int_is_regular(const struct btf_type *t) +{ + u16 nr_bits, nr_bytes; + u32 int_data; + + int_data = btf_type_int(t); + nr_bits = BTF_INT_BITS(int_data); + nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + if (BITS_PER_BYTE_MASKED(nr_bits) || + BTF_INT_OFFSET(int_data) || + (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + return false; + } + + return true; +} + +__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} + +__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, + const struct btf_type *t, + bool log_details, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + u8 kind = BTF_INFO_KIND(t->info); + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + __btf_verifier_log(log, "[%u] %s %s%s", + env->log_type_id, + btf_kind_str[kind], + btf_name_by_offset(btf, t->name_off), + log_details ? " " : ""); + + if (log_details) + btf_type_ops(t)->log_details(env, t); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +#define btf_verifier_log_type(env, t, ...) \ + __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +#define btf_verifier_log_basic(env, t, ...) \ + __btf_verifier_log_type((env), (t), false, __VA_ARGS__) + +__printf(4, 5) +static void btf_verifier_log_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + struct btf *btf = env->btf; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + /* The CHECK_META phase already did a btf dump. + * + * If member is logged again, it must hit an error in + * parsing this member. It is useful to print out which + * struct this member belongs to. + */ + if (env->phase != CHECK_META) + btf_verifier_log_type(env, struct_type, NULL); + + __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", + btf_name_by_offset(btf, member->name_off), + member->type, member->offset); + + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + +static void btf_verifier_log_hdr(struct btf_verifier_env *env, + u32 btf_data_size) +{ + struct bpf_verifier_log *log = &env->log; + const struct btf *btf = env->btf; + const struct btf_header *hdr; + + if (!bpf_verifier_log_needed(log)) + return; + + hdr = &btf->hdr; + __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); + __btf_verifier_log(log, "version: %u\n", hdr->version); + __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); + __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); + __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); + __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); + __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); + __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); + __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); +} + +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +{ + struct btf *btf = env->btf; + + /* < 2 because +1 for btf_void which is always in btf->types[0]. + * btf_void is not accounted in btf->nr_types because btf_void + * does not come from the BTF file. + */ + if (btf->types_size - btf->nr_types < 2) { + /* Expand 'types' array */ + + struct btf_type **new_types; + u32 expand_by, new_size; + + if (btf->types_size == BTF_MAX_TYPE) { + btf_verifier_log(env, "Exceeded max num of types"); + return -E2BIG; + } + + expand_by = max_t(u32, btf->types_size >> 2, 16); + new_size = min_t(u32, BTF_MAX_TYPE, + btf->types_size + expand_by); + + new_types = kvcalloc(new_size, sizeof(*new_types), + GFP_KERNEL | __GFP_NOWARN); + if (!new_types) + return -ENOMEM; + + if (btf->nr_types == 0) + new_types[0] = &btf_void; + else + memcpy(new_types, btf->types, + sizeof(*btf->types) * (btf->nr_types + 1)); + + kvfree(btf->types); + btf->types = new_types; + btf->types_size = new_size; + } + + btf->types[++(btf->nr_types)] = t; + + return 0; +} + +static int btf_alloc_id(struct btf *btf) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock_bh(&btf_idr_lock); + id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); + if (id > 0) + btf->id = id; + spin_unlock_bh(&btf_idr_lock); + idr_preload_end(); + + if (WARN_ON_ONCE(!id)) + return -ENOSPC; + + return id > 0 ? 0 : id; +} + +static void btf_free_id(struct btf *btf) +{ + unsigned long flags; + + /* + * In map-in-map, calling map_delete_elem() on outer + * map will call bpf_map_put on the inner map. + * It will then eventually call btf_free_id() + * on the inner map. Some of the map_delete_elem() + * implementation may have irq disabled, so + * we need to use the _irqsave() version instead + * of the _bh() version. + */ + spin_lock_irqsave(&btf_idr_lock, flags); + idr_remove(&btf_idr, btf->id); + spin_unlock_irqrestore(&btf_idr_lock, flags); +} + +static void btf_free(struct btf *btf) +{ + kvfree(btf->types); + kvfree(btf->resolved_sizes); + kvfree(btf->resolved_ids); + kvfree(btf->data); + kfree(btf); +} + +static void btf_free_rcu(struct rcu_head *rcu) +{ + struct btf *btf = container_of(rcu, struct btf, rcu); + + btf_free(btf); +} + +void btf_put(struct btf *btf) +{ + if (btf && refcount_dec_and_test(&btf->refcnt)) { + btf_free_id(btf); + call_rcu(&btf->rcu, btf_free_rcu); + } +} + +static int env_resolve_init(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 nr_types = btf->nr_types; + u32 *resolved_sizes = NULL; + u32 *resolved_ids = NULL; + u8 *visit_states = NULL; + + /* +1 for btf_void */ + resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_sizes) + goto nomem; + + resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + GFP_KERNEL | __GFP_NOWARN); + if (!resolved_ids) + goto nomem; + + visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + GFP_KERNEL | __GFP_NOWARN); + if (!visit_states) + goto nomem; + + btf->resolved_sizes = resolved_sizes; + btf->resolved_ids = resolved_ids; + env->visit_states = visit_states; + + return 0; + +nomem: + kvfree(resolved_sizes); + kvfree(resolved_ids); + kvfree(visit_states); + return -ENOMEM; +} + +static void btf_verifier_env_free(struct btf_verifier_env *env) +{ + kvfree(env->visit_states); + kfree(env); +} + +static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, + const struct btf_type *next_type) +{ + switch (env->resolve_mode) { + case RESOLVE_TBD: + /* int, enum or void is a sink */ + return !btf_type_needs_resolve(next_type); + case RESOLVE_PTR: + /* int, enum, void, struct or array is a sink for ptr */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_ptr(next_type); + case RESOLVE_STRUCT_OR_ARRAY: + /* int, enum, void or ptr is a sink for struct and array */ + return !btf_type_is_modifier(next_type) && + !btf_type_is_array(next_type) && + !btf_type_is_struct(next_type); + default: + BUG(); + } +} + +static bool env_type_is_resolved(const struct btf_verifier_env *env, + u32 type_id) +{ + return env->visit_states[type_id] == RESOLVED; +} + +static int env_stack_push(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + struct resolve_vertex *v; + + if (env->top_stack == MAX_RESOLVE_DEPTH) + return -E2BIG; + + if (env->visit_states[type_id] != NOT_VISITED) + return -EEXIST; + + env->visit_states[type_id] = VISITED; + + v = &env->stack[env->top_stack++]; + v->t = t; + v->type_id = type_id; + v->next_member = 0; + + if (env->resolve_mode == RESOLVE_TBD) { + if (btf_type_is_ptr(t)) + env->resolve_mode = RESOLVE_PTR; + else if (btf_type_is_struct(t) || btf_type_is_array(t)) + env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; + } + + return 0; +} + +static void env_stack_set_next_member(struct btf_verifier_env *env, + u16 next_member) +{ + env->stack[env->top_stack - 1].next_member = next_member; +} + +static void env_stack_pop_resolved(struct btf_verifier_env *env, + u32 resolved_type_id, + u32 resolved_size) +{ + u32 type_id = env->stack[--(env->top_stack)].type_id; + struct btf *btf = env->btf; + + btf->resolved_sizes[type_id] = resolved_size; + btf->resolved_ids[type_id] = resolved_type_id; + env->visit_states[type_id] = RESOLVED; +} + +static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +{ + return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +} + +/* The input param "type_id" must point to a needs_resolve type */ +static const struct btf_type *btf_type_id_resolve(const struct btf *btf, + u32 *type_id) +{ + *type_id = btf->resolved_ids[*type_id]; + return btf_type_by_id(btf, *type_id); +} + +const struct btf_type *btf_type_id_size(const struct btf *btf, + u32 *type_id, u32 *ret_size) +{ + const struct btf_type *size_type; + u32 size_type_id = *type_id; + u32 size = 0; + + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void_or_null(size_type)) + return NULL; + + if (btf_type_has_size(size_type)) { + size = size_type->size; + } else if (btf_type_is_array(size_type)) { + size = btf->resolved_sizes[size_type_id]; + } else if (btf_type_is_ptr(size_type)) { + size = sizeof(void *); + } else { + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + return NULL; + + size = btf->resolved_sizes[size_type_id]; + size_type_id = btf->resolved_ids[size_type_id]; + size_type = btf_type_by_id(btf, size_type_id); + if (btf_type_is_void(size_type)) + return NULL; + } + + *type_id = size_type_id; + if (ret_size) + *ret_size = size; + + return size_type; +} + +static int btf_df_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + btf_verifier_log_basic(env, struct_type, + "Unsupported check_member"); + return -EINVAL; +} + +static int btf_df_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + btf_verifier_log_basic(env, v->t, "Unsupported resolve"); + return -EINVAL; +} + +static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offsets, + struct seq_file *m) +{ + seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); +} + +static int btf_int_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 int_data = btf_type_int(member_type); + u32 struct_bits_off = member->offset; + u32 struct_size = struct_type->size; + u32 nr_copy_bits; + u32 bytes_offset; + + if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { + btf_verifier_log_member(env, struct_type, member, + "bits_offset exceeds U32_MAX"); + return -EINVAL; + } + + struct_bits_off += BTF_INT_OFFSET(int_data); + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + nr_copy_bits = BTF_INT_BITS(int_data) + + BITS_PER_BYTE_MASKED(struct_bits_off); + + if (nr_copy_bits > BITS_PER_U64) { + btf_verifier_log_member(env, struct_type, member, + "nr_copy_bits exceeds 64"); + return -EINVAL; + } + + if (struct_size < bytes_offset || + struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_int_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 int_data, nr_bits, meta_needed = sizeof(int_data); + u16 encoding; + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + int_data = btf_type_int(t); + if (int_data & ~BTF_INT_MASK) { + btf_verifier_log_basic(env, t, "Invalid int_data:%x", + int_data); + return -EINVAL; + } + + nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); + + if (nr_bits > BITS_PER_U64) { + btf_verifier_log_type(env, t, "nr_bits exceeds %zu", + BITS_PER_U64); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { + btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); + return -EINVAL; + } + + /* + * Only one of the encoding bits is allowed and it + * should be sufficient for the pretty print purpose (i.e. decoding). + * Multiple bits can be allowed later if it is found + * to be insufficient. + */ + encoding = BTF_INT_ENCODING(int_data); + if (encoding && + encoding != BTF_INT_SIGNED && + encoding != BTF_INT_CHAR && + encoding != BTF_INT_BOOL) { + btf_verifier_log_type(env, t, "Unsupported encoding"); + return -ENOTSUPP; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_int_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + int int_data = btf_type_int(t); + + btf_verifier_log(env, + "size=%u bits_offset=%u nr_bits=%u encoding=%s", + t->size, BTF_INT_OFFSET(int_data), + BTF_INT_BITS(int_data), + btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +} + +static void btf_int_bits_seq_show(const struct btf *btf, + const struct btf_type *t, + void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u16 nr_bits = BTF_INT_BITS(int_data); + u16 total_bits_offset; + u16 nr_copy_bytes; + u16 nr_copy_bits; + u8 nr_upper_bits; + union { + u64 u64_num; + u8 u8_nums[8]; + } print_num; + + total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + nr_copy_bits = nr_bits + bits_offset; + nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); + + print_num.u64_num = 0; + memcpy(&print_num.u64_num, data, nr_copy_bytes); + + /* Ditch the higher order bits */ + nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits); + if (nr_upper_bits) { + /* We need to mask out some bits of the upper byte. */ + u8 mask = (1 << nr_upper_bits) - 1; + + print_num.u8_nums[nr_copy_bytes - 1] &= mask; + } + + print_num.u64_num >>= bits_offset; + + seq_printf(m, "0x%llx", print_num.u64_num); +} + +static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + u32 int_data = btf_type_int(t); + u8 encoding = BTF_INT_ENCODING(int_data); + bool sign = encoding & BTF_INT_SIGNED; + u32 nr_bits = BTF_INT_BITS(int_data); + + if (bits_offset || BTF_INT_OFFSET(int_data) || + BITS_PER_BYTE_MASKED(nr_bits)) { + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + return; + } + + switch (nr_bits) { + case 64: + if (sign) + seq_printf(m, "%lld", *(s64 *)data); + else + seq_printf(m, "%llu", *(u64 *)data); + break; + case 32: + if (sign) + seq_printf(m, "%d", *(s32 *)data); + else + seq_printf(m, "%u", *(u32 *)data); + break; + case 16: + if (sign) + seq_printf(m, "%d", *(s16 *)data); + else + seq_printf(m, "%u", *(u16 *)data); + break; + case 8: + if (sign) + seq_printf(m, "%d", *(s8 *)data); + else + seq_printf(m, "%u", *(u8 *)data); + break; + default: + btf_int_bits_seq_show(btf, t, data, bits_offset, m); + } +} + +static const struct btf_kind_operations int_ops = { + .check_meta = btf_int_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_int_check_member, + .log_details = btf_int_log, + .seq_show = btf_int_seq_show, +}; + +static int btf_modifier_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + const struct btf_type *resolved_type; + u32 resolved_type_id = member->type; + struct btf_member resolved_member; + struct btf *btf = env->btf; + + resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); + if (!resolved_type) { + btf_verifier_log_member(env, struct_type, member, + "Invalid member"); + return -EINVAL; + } + + resolved_member = *member; + resolved_member.type = resolved_type_id; + + return btf_type_ops(resolved_type)->check_member(env, struct_type, + &resolved_member, + resolved_type); +} + +static int btf_ptr_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_size, struct_bits_off, bytes_offset; + + struct_size = struct_type->size; + struct_bits_off = member->offset; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + if (struct_size - bytes_offset < sizeof(void *)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static int btf_ref_type_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (!BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static int btf_modifier_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + const struct btf_type *next_type; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + /* "typedef void new_void", "const void"...etc */ + if (btf_type_is_void(next_type)) + goto resolved; + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + /* Figure out the resolved next_type_id with size. + * They will be stored in the current modifier's + * resolved_ids and resolved_sizes such that it can + * save us a few type-following when we use it later (e.g. in + * pretty print). + */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + +static int btf_ptr_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size = 0; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + /* "void *" */ + if (btf_type_is_void(next_type)) + goto resolved; + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, + * the modifier may have stopped resolving when it was resolved + * to a ptr (last-resolved-ptr). + * + * We now need to continue from the last-resolved-ptr to + * ensure the last-resolved-ptr will not referring back to + * the currenct ptr (t). + */ + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + if (!btf_type_id_size(btf, &next_type_id, &next_type_size) && + !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + +resolved: + env_stack_pop_resolved(env, next_type_id, 0); + + return 0; +} + +static void btf_modifier_seq_show(const struct btf *btf, + const struct btf_type *t, + u32 type_id, void *data, + u8 bits_offset, struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + +static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + /* It is a hashed value */ + seq_printf(m, "%p", *(void **)data); +} + +static void btf_ref_type_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "type_id=%u", t->type); +} + +static struct btf_kind_operations modifier_ops = { + .check_meta = btf_ref_type_check_meta, + .resolve = btf_modifier_resolve, + .check_member = btf_modifier_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_modifier_seq_show, +}; + +static struct btf_kind_operations ptr_ops = { + .check_meta = btf_ref_type_check_meta, + .resolve = btf_ptr_resolve, + .check_member = btf_ptr_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_ptr_seq_show, +}; + +static s32 btf_fwd_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (t->type) { + btf_verifier_log_type(env, t, "type != 0"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return 0; +} + +static struct btf_kind_operations fwd_ops = { + .check_meta = btf_fwd_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_df_check_member, + .log_details = btf_ref_type_log, + .seq_show = btf_df_seq_show, +}; + +static int btf_array_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + u32 array_type_id, array_size; + struct btf *btf = env->btf; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + array_type_id = member->type; + btf_type_id_size(btf, &array_type_id, &array_size); + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < array_size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_array_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_array *array = btf_type_array(t); + u32 meta_needed = sizeof(*array); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (t->size) { + btf_verifier_log_type(env, t, "size != 0"); + return -EINVAL; + } + + /* Array elem type and index type cannot be in type void, + * so !array->type and !array->index_type are not allowed. + */ + if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { + btf_verifier_log_type(env, t, "Invalid elem"); + return -EINVAL; + } + + if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { + btf_verifier_log_type(env, t, "Invalid index"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static int btf_array_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_array *array = btf_type_array(v->t); + const struct btf_type *elem_type, *index_type; + u32 elem_type_id, index_type_id; + struct btf *btf = env->btf; + u32 elem_size; + + /* Check array->index_type */ + index_type_id = array->index_type; + index_type = btf_type_by_id(btf, index_type_id); + if (btf_type_is_void_or_null(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, index_type) && + !env_type_is_resolved(env, index_type_id)) + return env_stack_push(env, index_type, index_type_id); + + index_type = btf_type_id_size(btf, &index_type_id, NULL); + if (!index_type || !btf_type_is_int(index_type) || + !btf_type_int_is_regular(index_type)) { + btf_verifier_log_type(env, v->t, "Invalid index"); + return -EINVAL; + } + + /* Check array->type */ + elem_type_id = array->type; + elem_type = btf_type_by_id(btf, elem_type_id); + if (btf_type_is_void_or_null(elem_type)) { + btf_verifier_log_type(env, v->t, + "Invalid elem"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, elem_type) && + !env_type_is_resolved(env, elem_type_id)) + return env_stack_push(env, elem_type, elem_type_id); + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + if (!elem_type) { + btf_verifier_log_type(env, v->t, "Invalid elem"); + return -EINVAL; + } + + if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { + btf_verifier_log_type(env, v->t, "Invalid array of int"); + return -EINVAL; + } + + if (array->nelems && elem_size > U32_MAX / array->nelems) { + btf_verifier_log_type(env, v->t, + "Array size overflows U32_MAX"); + return -EINVAL; + } + + env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); + + return 0; +} + +static void btf_array_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + const struct btf_array *array = btf_type_array(t); + + btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", + array->type, array->index_type, array->nelems); +} + +static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_array *array = btf_type_array(t); + const struct btf_kind_operations *elem_ops; + const struct btf_type *elem_type; + u32 i, elem_size, elem_type_id; + + elem_type_id = array->type; + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + elem_ops = btf_type_ops(elem_type); + seq_puts(m, "["); + for (i = 0; i < array->nelems; i++) { + if (i) + seq_puts(m, ","); + + elem_ops->seq_show(btf, elem_type, elem_type_id, data, + bits_offset, m); + data += elem_size; + } + seq_puts(m, "]"); +} + +static struct btf_kind_operations array_ops = { + .check_meta = btf_array_check_meta, + .resolve = btf_array_resolve, + .check_member = btf_array_check_member, + .log_details = btf_array_log, + .seq_show = btf_array_seq_show, +}; + +static int btf_struct_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < member_type->size) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_struct_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; + const struct btf_member *member; + struct btf *btf = env->btf; + u32 struct_size = t->size; + u32 meta_needed; + u16 i; + + meta_needed = btf_type_vlen(t) * sizeof(*member); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_member(i, t, member) { + if (!btf_name_offset_valid(btf, member->name_off)) { + btf_verifier_log_member(env, t, member, + "Invalid member name_offset:%u", + member->name_off); + return -EINVAL; + } + + /* A member cannot be in type void */ + if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { + btf_verifier_log_member(env, t, member, + "Invalid type_id"); + return -EINVAL; + } + + if (is_union && member->offset) { + btf_verifier_log_member(env, t, member, + "Invalid member bits_offset"); + return -EINVAL; + } + + if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) { + btf_verifier_log_member(env, t, member, + "Memmber bits_offset exceeds its struct size"); + return -EINVAL; + } + + btf_verifier_log_member(env, t, member, NULL); + } + + return meta_needed; +} + +static int btf_struct_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_member *member; + int err; + u16 i; + + /* Before continue resolving the next_member, + * ensure the last member is indeed resolved to a + * type with size info. + */ + if (v->next_member) { + const struct btf_type *last_member_type; + const struct btf_member *last_member; + u16 last_member_type_id; + + last_member = btf_type_member(v->t) + v->next_member - 1; + last_member_type_id = last_member->type; + if (WARN_ON_ONCE(!env_type_is_resolved(env, + last_member_type_id))) + return -EINVAL; + + last_member_type = btf_type_by_id(env->btf, + last_member_type_id); + err = btf_type_ops(last_member_type)->check_member(env, v->t, + last_member, + last_member_type); + if (err) + return err; + } + + for_each_member_from(i, v->next_member, v->t, member) { + u32 member_type_id = member->type; + const struct btf_type *member_type = btf_type_by_id(env->btf, + member_type_id); + + if (btf_type_is_void_or_null(member_type)) { + btf_verifier_log_member(env, v->t, member, + "Invalid member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, member_type) && + !env_type_is_resolved(env, member_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, member_type, member_type_id); + } + + err = btf_type_ops(member_type)->check_member(env, v->t, + member, + member_type); + if (err) + return err; + } + + env_stack_pop_resolved(env, 0, 0); + + return 0; +} + +static void btf_struct_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ","; + const struct btf_member *member; + u32 i; + + seq_puts(m, "{"); + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + u32 member_offset = member->offset; + u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset); + const struct btf_kind_operations *ops; + + if (i) + seq_puts(m, seq); + + ops = btf_type_ops(member_type); + ops->seq_show(btf, member_type, member->type, + data + bytes_offset, bits8_offset, m); + } + seq_puts(m, "}"); +} + +static struct btf_kind_operations struct_ops = { + .check_meta = btf_struct_check_meta, + .resolve = btf_struct_resolve, + .check_member = btf_struct_check_member, + .log_details = btf_struct_log, + .seq_show = btf_struct_seq_show, +}; + +static int btf_enum_check_member(struct btf_verifier_env *env, + const struct btf_type *struct_type, + const struct btf_member *member, + const struct btf_type *member_type) +{ + u32 struct_bits_off = member->offset; + u32 struct_size, bytes_offset; + + if (BITS_PER_BYTE_MASKED(struct_bits_off)) { + btf_verifier_log_member(env, struct_type, member, + "Member is not byte aligned"); + return -EINVAL; + } + + struct_size = struct_type->size; + bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); + if (struct_size - bytes_offset < sizeof(int)) { + btf_verifier_log_member(env, struct_type, member, + "Member exceeds struct_size"); + return -EINVAL; + } + + return 0; +} + +static s32 btf_enum_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_enum *enums = btf_type_enum(t); + struct btf *btf = env->btf; + u16 i, nr_enums; + u32 meta_needed; + + nr_enums = btf_type_vlen(t); + meta_needed = nr_enums * sizeof(*enums); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (t->size != sizeof(int)) { + btf_verifier_log_type(env, t, "Expected size:%zu", + sizeof(int)); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for (i = 0; i < nr_enums; i++) { + if (!btf_name_offset_valid(btf, enums[i].name_off)) { + btf_verifier_log(env, "\tInvalid name_offset:%u", + enums[i].name_off); + return -EINVAL; + } + + btf_verifier_log(env, "\t%s val=%d\n", + btf_name_by_offset(btf, enums[i].name_off), + enums[i].val); + } + + return meta_needed; +} + +static void btf_enum_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_enum *enums = btf_type_enum(t); + u32 i, nr_enums = btf_type_vlen(t); + int v = *(int *)data; + + for (i = 0; i < nr_enums; i++) { + if (v == enums[i].val) { + seq_printf(m, "%s", + btf_name_by_offset(btf, enums[i].name_off)); + return; + } + } + + seq_printf(m, "%d", v); +} + +static struct btf_kind_operations enum_ops = { + .check_meta = btf_enum_check_meta, + .resolve = btf_df_resolve, + .check_member = btf_enum_check_member, + .log_details = btf_enum_log, + .seq_show = btf_enum_seq_show, +}; + +static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { + [BTF_KIND_INT] = &int_ops, + [BTF_KIND_PTR] = &ptr_ops, + [BTF_KIND_ARRAY] = &array_ops, + [BTF_KIND_STRUCT] = &struct_ops, + [BTF_KIND_UNION] = &struct_ops, + [BTF_KIND_ENUM] = &enum_ops, + [BTF_KIND_FWD] = &fwd_ops, + [BTF_KIND_TYPEDEF] = &modifier_ops, + [BTF_KIND_VOLATILE] = &modifier_ops, + [BTF_KIND_CONST] = &modifier_ops, + [BTF_KIND_RESTRICT] = &modifier_ops, +}; + +static s32 btf_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + u32 saved_meta_left = meta_left; + s32 var_meta_size; + + if (meta_left < sizeof(*t)) { + btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", + env->log_type_id, meta_left, sizeof(*t)); + return -EINVAL; + } + meta_left -= sizeof(*t); + + if (t->info & ~BTF_INFO_MASK) { + btf_verifier_log(env, "[%u] Invalid btf_info:%x", + env->log_type_id, t->info); + return -EINVAL; + } + + if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || + BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { + btf_verifier_log(env, "[%u] Invalid kind:%u", + env->log_type_id, BTF_INFO_KIND(t->info)); + return -EINVAL; + } + + if (!btf_name_offset_valid(env->btf, t->name_off)) { + btf_verifier_log(env, "[%u] Invalid name_offset:%u", + env->log_type_id, t->name_off); + return -EINVAL; + } + + var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); + if (var_meta_size < 0) + return var_meta_size; + + meta_left -= var_meta_size; + + return saved_meta_left - meta_left; +} + +static int btf_check_all_metas(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + struct btf_header *hdr; + void *cur, *end; + + hdr = &btf->hdr; + cur = btf->nohdr_data + hdr->type_off; + end = btf->nohdr_data + hdr->type_len; + + env->log_type_id = 1; + while (cur < end) { + struct btf_type *t = cur; + s32 meta_size; + + meta_size = btf_check_meta(env, t, end - cur); + if (meta_size < 0) + return meta_size; + + btf_add_type(env, t); + cur += meta_size; + env->log_type_id++; + } + + return 0; +} + +static int btf_resolve(struct btf_verifier_env *env, + const struct btf_type *t, u32 type_id) +{ + const struct resolve_vertex *v; + int err = 0; + + env->resolve_mode = RESOLVE_TBD; + env_stack_push(env, t, type_id); + while (!err && (v = env_stack_peak(env))) { + env->log_type_id = v->type_id; + err = btf_type_ops(v->t)->resolve(env, v); + } + + env->log_type_id = type_id; + if (err == -E2BIG) + btf_verifier_log_type(env, t, + "Exceeded max resolving depth:%u", + MAX_RESOLVE_DEPTH); + else if (err == -EEXIST) + btf_verifier_log_type(env, t, "Loop detected"); + + return err; +} + +static bool btf_resolve_valid(struct btf_verifier_env *env, + const struct btf_type *t, + u32 type_id) +{ + struct btf *btf = env->btf; + + if (!env_type_is_resolved(env, type_id)) + return false; + + if (btf_type_is_struct(t)) + return !btf->resolved_ids[type_id] && + !btf->resolved_sizes[type_id]; + + if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + t = btf_type_id_resolve(btf, &type_id); + return t && !btf_type_is_modifier(t); + } + + if (btf_type_is_array(t)) { + const struct btf_array *array = btf_type_array(t); + const struct btf_type *elem_type; + u32 elem_type_id = array->type; + u32 elem_size; + + elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); + return elem_type && !btf_type_is_modifier(elem_type) && + (array->nelems * elem_size == + btf->resolved_sizes[type_id]); + } + + return false; +} + +static int btf_check_all_types(struct btf_verifier_env *env) +{ + struct btf *btf = env->btf; + u32 type_id; + int err; + + err = env_resolve_init(env); + if (err) + return err; + + env->phase++; + for (type_id = 1; type_id <= btf->nr_types; type_id++) { + const struct btf_type *t = btf_type_by_id(btf, type_id); + + env->log_type_id = type_id; + if (btf_type_needs_resolve(t) && + !env_type_is_resolved(env, type_id)) { + err = btf_resolve(env, t, type_id); + if (err) + return err; + } + + if (btf_type_needs_resolve(t) && + !btf_resolve_valid(env, t, type_id)) { + btf_verifier_log_type(env, t, "Invalid resolve state"); + return -EINVAL; + } + } + + return 0; +} + +static int btf_parse_type_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr = &env->btf->hdr; + int err; + + /* Type section must align to 4 bytes */ + if (hdr->type_off & (sizeof(u32) - 1)) { + btf_verifier_log(env, "Unaligned type_off"); + return -EINVAL; + } + + if (!hdr->type_len) { + btf_verifier_log(env, "No type found"); + return -EINVAL; + } + + err = btf_check_all_metas(env); + if (err) + return err; + + return btf_check_all_types(env); +} + +static int btf_parse_str_sec(struct btf_verifier_env *env) +{ + const struct btf_header *hdr; + struct btf *btf = env->btf; + const char *start, *end; + + hdr = &btf->hdr; + start = btf->nohdr_data + hdr->str_off; + end = start + hdr->str_len; + + if (end != btf->data + btf->data_size) { + btf_verifier_log(env, "String section is not at the end"); + return -EINVAL; + } + + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + start[0] || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + + btf->strings = start; + + return 0; +} + +static const size_t btf_sec_info_offset[] = { + offsetof(struct btf_header, type_off), + offsetof(struct btf_header, str_off), +}; + +static int btf_sec_info_cmp(const void *a, const void *b) +{ + const struct btf_sec_info *x = a; + const struct btf_sec_info *y = b; + + return (int)(x->off - y->off) ? : (int)(x->len - y->len); +} + +static int btf_check_sec_info(struct btf_verifier_env *env, + u32 btf_data_size) +{ + struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; + u32 total, expected_total, i; + const struct btf_header *hdr; + const struct btf *btf; + + btf = env->btf; + hdr = &btf->hdr; + + /* Populate the secs from hdr */ + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) + secs[i] = *(struct btf_sec_info *)((void *)hdr + + btf_sec_info_offset[i]); + + sort(secs, ARRAY_SIZE(btf_sec_info_offset), + sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); + + /* Check for gaps and overlap among sections */ + total = 0; + expected_total = btf_data_size - hdr->hdr_len; + for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { + if (expected_total < secs[i].off) { + btf_verifier_log(env, "Invalid section offset"); + return -EINVAL; + } + if (total < secs[i].off) { + /* gap */ + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + if (total > secs[i].off) { + btf_verifier_log(env, "Section overlap found"); + return -EINVAL; + } + if (expected_total - total < secs[i].len) { + btf_verifier_log(env, + "Total section length too long"); + return -EINVAL; + } + total += secs[i].len; + } + + /* There is data other than hdr and known sections */ + if (expected_total != total) { + btf_verifier_log(env, "Unsupported section found"); + return -EINVAL; + } + + return 0; +} + +static int btf_parse_hdr(struct btf_verifier_env *env, void __user *btf_data, + u32 btf_data_size) +{ + const struct btf_header *hdr; + u32 hdr_len, hdr_copy; + /* + * Minimal part of the "struct btf_header" that + * contains the hdr_len. + */ + struct btf_min_header { + u16 magic; + u8 version; + u8 flags; + u32 hdr_len; + } __user *min_hdr; + struct btf *btf; + int err; + + btf = env->btf; + min_hdr = btf_data; + + if (btf_data_size < sizeof(*min_hdr)) { + btf_verifier_log(env, "hdr_len not found"); + return -EINVAL; + } + + if (get_user(hdr_len, &min_hdr->hdr_len)) + return -EFAULT; + + if (btf_data_size < hdr_len) { + btf_verifier_log(env, "btf_header not found"); + return -EINVAL; + } + + err = bpf_check_uarg_tail_zero(btf_data, sizeof(btf->hdr), hdr_len); + if (err) { + if (err == -E2BIG) + btf_verifier_log(env, "Unsupported btf_header"); + return err; + } + + hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); + if (copy_from_user(&btf->hdr, btf_data, hdr_copy)) + return -EFAULT; + + hdr = &btf->hdr; + + btf_verifier_log_hdr(env, btf_data_size); + + if (hdr->magic != BTF_MAGIC) { + btf_verifier_log(env, "Invalid magic"); + return -EINVAL; + } + + if (hdr->version != BTF_VERSION) { + btf_verifier_log(env, "Unsupported version"); + return -ENOTSUPP; + } + + if (hdr->flags) { + btf_verifier_log(env, "Unsupported flags"); + return -ENOTSUPP; + } + + if (btf_data_size == hdr->hdr_len) { + btf_verifier_log(env, "No data"); + return -EINVAL; + } + + err = btf_check_sec_info(env, btf_data_size); + if (err) + return err; + + return 0; +} + +static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, + u32 log_level, char __user *log_ubuf, u32 log_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL; + u8 *data; + int err; + + if (btf_data_size > BTF_MAX_SIZE) + return ERR_PTR(-E2BIG); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + if (log_level || log_ubuf || log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log->level = log_level; + log->ubuf = log_ubuf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) { + err = -EINVAL; + goto errout; + } + } + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + err = btf_parse_hdr(env, btf_data, btf_data_size); + if (err) + goto errout; + + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + if (!data) { + err = -ENOMEM; + goto errout; + } + + btf->data = data; + btf->data_size = btf_data_size; + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + if (copy_from_user(data, btf_data, btf_data_size)) { + err = -EFAULT; + goto errout; + } + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_parse_type_sec(env); + if (err) + goto errout; + + if (log->level && bpf_verifier_log_full(log)) { + err = -ENOSPC; + goto errout; + } + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) + btf_free(btf); + return ERR_PTR(err); +} + +void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m) +{ + const struct btf_type *t = btf_type_by_id(btf, type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +} + +static int btf_release(struct inode *inode, struct file *filp) +{ + btf_put(filp->private_data); + return 0; +} + +const struct file_operations btf_fops = { + .release = btf_release, +}; + +static int __btf_new_fd(struct btf *btf) +{ + return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); +} + +int btf_new_fd(const union bpf_attr *attr) +{ + struct btf *btf; + int ret; + + btf = btf_parse(u64_to_user_ptr(attr->btf), + attr->btf_size, attr->btf_log_level, + u64_to_user_ptr(attr->btf_log_buf), + attr->btf_log_size); + if (IS_ERR(btf)) + return PTR_ERR(btf); + + ret = btf_alloc_id(btf); + if (ret) { + btf_free(btf); + return ret; + } + + /* + * The BTF ID is published to the userspace. + * All BTF free must go through call_rcu() from + * now on (i.e. free by calling btf_put()). + */ + + ret = __btf_new_fd(btf); + if (ret < 0) + btf_put(btf); + + return ret; +} + +struct btf *btf_get_by_fd(int fd) +{ + struct btf *btf; + struct fd f; + + f = fdget(fd); + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &btf_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + btf = f.file->private_data; + refcount_inc(&btf->refcnt); + fdput(f); + + return btf; +} + +int btf_get_info_by_fd(const struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_btf_info __user *uinfo; + struct bpf_btf_info info = {}; + u32 info_copy, btf_copy; + void __user *ubtf; + u32 uinfo_len; + + uinfo = u64_to_user_ptr(attr->info.info); + uinfo_len = attr->info.info_len; + + info_copy = min_t(u32, uinfo_len, sizeof(info)); + if (copy_from_user(&info, uinfo, info_copy)) + return -EFAULT; + + info.id = btf->id; + ubtf = u64_to_user_ptr(info.btf); + btf_copy = min_t(u32, btf->data_size, info.btf_size); + if (copy_to_user(ubtf, btf->data, btf_copy)) + return -EFAULT; + info.btf_size = btf->data_size; + + if (copy_to_user(uinfo, &info, info_copy) || + put_user(info_copy, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + +int btf_get_fd_by_id(u32 id) +{ + struct btf *btf; + int fd; + + rcu_read_lock(); + btf = idr_find(&btf_idr, id); + if (!btf || !refcount_inc_not_zero(&btf->refcnt)) + btf = ERR_PTR(-ENOENT); + rcu_read_unlock(); + + if (IS_ERR(btf)) + return PTR_ERR(btf); + + fd = __btf_new_fd(btf); + if (fd < 0) + btf_put(btf); + + return fd; +} + +u32 btf_id(const struct btf *btf) +{ + return btf->id; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 43171a0bb02b..f7c00bd6f8e4 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -500,6 +500,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted + * @t_ctx: Pointer to attach type specific context * * socket is expected to be of type INET or INET6. * @@ -508,12 +509,15 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); */ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type) + enum bpf_attach_type type, + void *t_ctx) { struct bpf_sock_addr_kern ctx = { .sk = sk, .uaddr = uaddr, + .t_ctx = t_ctx, }; + struct sockaddr_storage unspec; struct cgroup *cgrp; int ret; @@ -523,6 +527,11 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) return 0; + if (!ctx.uaddr) { + memset(&unspec, 0, sizeof(unspec)); + ctx.uaddr = (struct sockaddr *)&unspec; + } + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6ef6746a7871..a9e6c04d0f4a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -31,6 +31,7 @@ #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> +#include <linux/perf_event.h> #include <asm/unaligned.h> @@ -349,6 +350,20 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, return prog_adj; } +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +{ + int i; + + for (i = 0; i < fp->aux->func_cnt; i++) + bpf_prog_kallsyms_del(fp->aux->func[i]); +} + +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) +{ + bpf_prog_kallsyms_del_subprogs(fp); + bpf_prog_kallsyms_del(fp); +} + #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); @@ -583,6 +598,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; + hdr->locked = 0; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -683,23 +700,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -940,14 +940,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -957,6 +950,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -987,8 +987,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1315,67 +1313,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here @@ -1513,6 +1450,33 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } +static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) +{ +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + int i, err; + + for (i = 0; i < fp->aux->func_cnt; i++) { + err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); + if (err) + return err; + } + + return bpf_prog_check_pages_ro_single(fp); +#endif + return 0; +} + +static void bpf_prog_select_func(struct bpf_prog *fp) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0_warn; +#endif +} + /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -1523,13 +1487,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + /* In case of BPF to BPF calls, verifier did all the prep + * work with regards to JITing, etc. + */ + if (fp->bpf_func) + goto finalize; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; -#else - fp->bpf_func = __bpf_prog_ret0_warn; -#endif + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1550,6 +1514,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; } + +finalize: bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1558,7 +1524,17 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - + if (*err) + return fp; + + /* Checkpoint: at this point onwards any cBPF -> eBPF or + * native eBPF program is read-only. If we failed to change + * the page attributes (e.g. allocation failure from + * splitting large pages), then reject the whole program + * in order to guarantee not ending up with any W+X pages + * from BPF side in kernel. + */ + *err = bpf_prog_check_pages_ro_locked(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); @@ -1695,6 +1671,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog **existing_prog; struct bpf_prog_array *array; + bool found_exclude = false; int new_prog_idx = 0; /* Figure out how many existing progs we need to carry over to @@ -1703,14 +1680,20 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, if (old_array) { existing_prog = old_array->progs; for (; *existing_prog; existing_prog++) { - if (*existing_prog != exclude_prog && - *existing_prog != &dummy_bpf_prog.prog) + if (*existing_prog == exclude_prog) { + found_exclude = true; + continue; + } + if (*existing_prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (*existing_prog == include_prog) return -EEXIST; } } + if (exclude_prog && !found_exclude) + return -ENOENT; + /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) @@ -1772,6 +1755,10 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); +#ifdef CONFIG_PERF_EVENTS + if (aux->prog->has_callchain_buf) + put_callchain_buffers(); +#endif for (i = 0; i < aux->func_cnt; i++) bpf_jit_free(aux->func[i]); if (aux->func_cnt) { @@ -1832,6 +1819,8 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_sock_map_update_proto __weak; +const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { @@ -1844,6 +1833,7 @@ bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, { return -ENOTSUPP; } +EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { @@ -1890,9 +1880,3 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); - -/* These are only used within the BPF_SYSCALL code */ -#ifdef CONFIG_BPF_SYSCALL -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type); -EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu); -#endif diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..e0918d180f08 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ptr_ring.h> +#include <net/xdp.h> #include <linux/sched.h> #include <linux/workqueue.h> @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + struct xdp_frame *xdpf; + + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdpf); continue; } @@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame_rx_napi(xdpf); } processed++; } @@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ - bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 565f9ece9115..642c97f6d1b8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,15 +48,25 @@ * calls will fail at this point. */ #include <linux/bpf.h> +#include <net/xdp.h> #include <linux/filter.h> +#include <trace/events/xdp.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +#define DEV_MAP_BULK_SIZE 16 +struct xdp_bulk_queue { + struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct net_device *dev_rx; + unsigned int count; +}; + struct bpf_dtab_netdev { - struct net_device *dev; + struct net_device *dev; /* must be first member, due to tracepoint */ struct bpf_dtab *dtab; unsigned int bit; + struct xdp_bulk_queue __percpu *bulkq; struct rcu_head rcu; }; @@ -206,6 +216,50 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) __set_bit(bit, bitmap); } +static int bq_xmit_all(struct bpf_dtab_netdev *obj, + struct xdp_bulk_queue *bq, u32 flags) +{ + struct net_device *dev = obj->dev; + int sent = 0, drops = 0, err = 0; + int i; + + if (unlikely(!bq->count)) + return 0; + + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + prefetch(xdpf); + } + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags); + if (sent < 0) { + err = sent; + sent = 0; + goto error; + } + drops = bq->count - sent; +out: + bq->count = 0; + + trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, + sent, drops, bq->dev_rx, dev, err); + bq->dev_rx = NULL; + return 0; +error: + /* If ndo_xdp_xmit fails with an errno, no frames have been + * xmit'ed and it's our responsibility to them free all. + */ + for (i = 0; i < bq->count; i++) { + struct xdp_frame *xdpf = bq->q[i]; + + /* RX path under NAPI protection, can return frames faster */ + xdp_return_frame_rx_napi(xdpf); + drops++; + } + goto out; +} + /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled @@ -221,7 +275,7 @@ void __dev_map_flush(struct bpf_map *map) for_each_set_bit(bit, bitmap, map->max_entries) { struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct net_device *netdev; + struct xdp_bulk_queue *bq; /* This is possible if the dev entry is removed by user space * between xdp redirect and flush op. @@ -230,9 +284,9 @@ void __dev_map_flush(struct bpf_map *map) continue; __clear_bit(bit, bitmap); - netdev = dev->dev; - if (likely(netdev->netdev_ops->ndo_xdp_flush)) - netdev->netdev_ops->ndo_xdp_flush(netdev); + + bq = this_cpu_ptr(dev->bulkq); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } @@ -240,37 +294,93 @@ void __dev_map_flush(struct bpf_map *map) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - struct bpf_dtab_netdev *dev; + struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; - dev = READ_ONCE(dtab->netdev_map[key]); - return dev ? dev->dev : NULL; + obj = READ_ONCE(dtab->netdev_map[key]); + return obj; +} + +/* Runs under RCU-read-side, plus in softirq under NAPI protection. + * Thus, safe percpu variable access. + */ +static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, + struct net_device *dev_rx) + +{ + struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); + + if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) + bq_xmit_all(obj, bq, 0); + + /* Ingress dev_rx will be the same for all xdp_frame's in + * bulk_queue, because bq stored per-CPU and must be flushed + * from net_device drivers NAPI func end. + */ + if (!bq->dev_rx) + bq->dev_rx = dev_rx; + + bq->q[bq->count++] = xdpf; + return 0; +} + +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) +{ + struct net_device *dev = dst->dev; + struct xdp_frame *xdpf; + + if (!dev->netdev_ops->ndo_xdp_xmit) + return -EOPNOTSUPP; + + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + return bq_enqueue(dst, xdpf, dev_rx); +} + +int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + int err; + + err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + if (unlikely(err)) + return err; + skb->dev = dst->dev; + generic_xdp_tx(skb, xdp_prog); + + return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { - struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); + struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); + struct net_device *dev = obj ? obj->dev : NULL; return dev ? &dev->ifindex : NULL; } static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { - if (dev->dev->netdev_ops->ndo_xdp_flush) { - struct net_device *fl = dev->dev; + if (dev->dev->netdev_ops->ndo_xdp_xmit) { + struct xdp_bulk_queue *bq; unsigned long *bitmap; + int cpu; for_each_online_cpu(cpu) { bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); __clear_bit(dev->bit, bitmap); - fl->netdev_ops->ndo_xdp_flush(dev->dev); + bq = per_cpu_ptr(dev->bulkq, cpu); + bq_xmit_all(dev, bq, XDP_XMIT_FLUSH); } } } @@ -281,6 +391,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) dev = container_of(rcu, struct bpf_dtab_netdev, rcu); dev_map_flush_old(dev); + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } @@ -313,6 +424,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct net *net = current->nsproxy->net_ns; + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; @@ -327,13 +439,20 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node); if (!dev) return -ENOMEM; + dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), + sizeof(void *), gfp); + if (!dev->bulkq) { + kfree(dev); + return -ENOMEM; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { + free_percpu(dev->bulkq); kfree(dev); return -EINVAL; } @@ -405,6 +524,9 @@ static struct notifier_block dev_map_notifier = { static int __init dev_map_init(void) { + /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ + BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != + offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b76828f23b49..3ca2198a6d22 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + @@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, const int ret = BPF_REG_0; const int ref_reg = BPF_REG_1; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, offsetof(struct htab_elem, lru_node) + @@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; - *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, offsetof(struct htab_elem, key) + diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 3d24e238221e..73065e2d23c2 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -179,3 +179,18 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE, }; + +#ifdef CONFIG_CGROUPS +BPF_CALL_0(bpf_get_current_cgroup_id) +{ + struct cgroup *cgrp = task_dfl_cgroup(current); + + return cgrp->kn->id.id; +} + +const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { + .func = bpf_get_current_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; +#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bf6da59ae0d0..76efe9a183f5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,8 +150,163 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } +struct map_iter { + void *key; + bool done; +}; + +static struct map_iter *map_iter(struct seq_file *m) +{ + return m->private; +} + +static struct bpf_map *seq_file_to_map(struct seq_file *m) +{ + return file_inode(m->file)->i_private; +} + +static void map_iter_free(struct map_iter *iter) +{ + if (iter) { + kfree(iter->key); + kfree(iter); + } +} + +static struct map_iter *map_iter_alloc(struct bpf_map *map) +{ + struct map_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); + if (!iter) + goto error; + + iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); + if (!iter->key) + goto error; + + return iter; + +error: + map_iter_free(iter); + return NULL; +} + +static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (map_iter(m)->done) + return NULL; + + if (unlikely(v == SEQ_START_TOKEN)) + goto done; + + if (map->ops->map_get_next_key(map, key, key)) { + map_iter(m)->done = true; + return NULL; + } + +done: + ++(*pos); + return key; +} + +static void *map_seq_start(struct seq_file *m, loff_t *pos) +{ + if (map_iter(m)->done) + return NULL; + + return *pos ? map_iter(m)->key : SEQ_START_TOKEN; +} + +static void map_seq_stop(struct seq_file *m, void *v) +{ +} + +static int map_seq_show(struct seq_file *m, void *v) +{ + struct bpf_map *map = seq_file_to_map(m); + void *key = map_iter(m)->key; + + if (unlikely(v == SEQ_START_TOKEN)) { + seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); + seq_puts(m, "# WARNING!! The output format will change\n"); + } else { + map->ops->map_seq_show_elem(map, key, m); + } + + return 0; +} + +static const struct seq_operations bpffs_map_seq_ops = { + .start = map_seq_start, + .next = map_seq_next, + .show = map_seq_show, + .stop = map_seq_stop, +}; + +static int bpffs_map_open(struct inode *inode, struct file *file) +{ + struct bpf_map *map = inode->i_private; + struct map_iter *iter; + struct seq_file *m; + int err; + + iter = map_iter_alloc(map); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &bpffs_map_seq_ops); + if (err) { + map_iter_free(iter); + return err; + } + + m = file->private_data; + m->private = iter; + + return 0; +} + +static int bpffs_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + map_iter_free(map_iter(m)); + + return seq_release(inode, file); +} + +/* bpffs_map_fops should only implement the basic + * read operation for a BPF map. The purpose is to + * provide a simple user intuitive way to do + * "cat bpffs/pathto/a-pinned-map". + * + * Other operations (e.g. write, lookup...) should be realized by + * the userspace tools (e.g. bpftool) through the + * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update + * interface. + */ +static const struct file_operations bpffs_map_fops = { + .open = bpffs_map_open, + .read = seq_read, + .release = bpffs_map_release, +}; + +static int bpffs_obj_open(struct inode *inode, struct file *file) +{ + return -EIO; +} + +static const struct file_operations bpffs_obj_fops = { + .open = bpffs_obj_open, +}; + static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, - const struct inode_operations *iops) + const struct inode_operations *iops, + const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); @@ -159,6 +314,7 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, return PTR_ERR(inode); inode->i_op = iops; + inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); @@ -167,12 +323,16 @@ static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, + &bpffs_obj_fops); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { - return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); + struct bpf_map *map = arg; + + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, + map->btf ? &bpffs_map_fops : &bpffs_obj_fops); } static struct dentry * @@ -279,13 +439,6 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname) ret = bpf_obj_do_pin(pname, raw, type); if (ret != 0) bpf_any_put(raw, type); - if ((trace_bpf_obj_pin_prog_enabled() || - trace_bpf_obj_pin_map_enabled()) && !ret) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_pin_prog(raw, ufd, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_pin_map(raw, ufd, pname); - } out: putname(pname); return ret; @@ -352,15 +505,8 @@ int bpf_obj_get_user(const char __user *pathname, int flags) else goto out; - if (ret < 0) { + if (ret < 0) bpf_any_put(raw, type); - } else if (trace_bpf_obj_get_prog_enabled() || - trace_bpf_obj_get_map_enabled()) { - if (type == BPF_TYPE_PROG) - trace_bpf_obj_get_prog(raw, ret, pname); - if (type == BPF_TYPE_MAP) - trace_bpf_obj_get_map(raw, ret, pname); - } out: putname(pname); return ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b4b5b81e7251..1603492c9cc7 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -623,8 +623,9 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_ATOMIC | __GFP_NOWARN); + node_stack = kmalloc_array(trie->max_prefixlen, + sizeof(struct lpm_trie_node *), + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index c9401075b58c..ac747d5cf7c6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2017 Netronome Systems, Inc. + * Copyright (C) 2017-2018 Netronome Systems, Inc. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this @@ -474,8 +474,10 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux)) return false; + if (!bpf_map_is_dev_bound(map)) + return bpf_map_offload_neutral(map); down_read(&bpf_devs_lock); offload = prog->aux->offload; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 95a84b2f10ce..52a91d816c0e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -48,14 +48,40 @@ #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) -struct bpf_stab { - struct bpf_map map; - struct sock **sock_map; +struct bpf_sock_progs { struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_sock_progs progs; +}; + +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + +struct bpf_htab { + struct bpf_map map; + struct bucket *buckets; + atomic_t count; + u32 n_buckets; + u32 elem_size; + struct bpf_sock_progs progs; +}; + +struct htab_elem { + struct rcu_head rcu; + struct hlist_node hash_node; + u32 hash; + struct sock *sk; + char key[0]; +}; + enum smap_psock_state { SMAP_TX_RUNNING, }; @@ -63,6 +89,8 @@ enum smap_psock_state { struct smap_psock_map_entry { struct list_head list; struct sock **entry; + struct htab_elem *hash_link; + struct bpf_htab *htab; }; struct smap_psock { @@ -191,6 +219,12 @@ out: rcu_read_unlock(); } +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + atomic_dec(&htab->count); + kfree_rcu(l, rcu); +} + static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -227,10 +261,16 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); + if (e->entry) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } else { + hlist_del_rcu(&e->hash_link->hash_node); + smap_release_sock(psock, e->hash_link->sk); + free_htab_elem(e->htab, e->hash_link); } } write_unlock_bh(&sk->sk_callback_lock); @@ -461,7 +501,7 @@ static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) { return ((_rc == SK_PASS) ? - (md->map ? __SK_REDIRECT : __SK_PASS) : + (md->sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP); } @@ -483,6 +523,7 @@ static unsigned int smap_do_tx_msg(struct sock *sk, } bpf_compute_data_pointers_sg(md); + md->sk = sk; rc = (*prog->bpf_func)(md, prog->insnsi); psock->apply_bytes = md->apply_bytes; @@ -1092,7 +1133,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) * when we orphan the skb so that we don't have the possibility * to reference a stale map. */ - TCP_SKB_CB(skb)->bpf.map = NULL; + TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; bpf_compute_data_pointers(skb); preempt_disable(); @@ -1102,7 +1143,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) /* Moving return codes from UAPI namespace into internal namespace */ return rc == SK_PASS ? - (TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) : + (TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) : __SK_DROP; } @@ -1372,7 +1413,6 @@ static int smap_init_sock(struct smap_psock *psock, } static void smap_init_progs(struct smap_psock *psock, - struct bpf_stab *stab, struct bpf_prog *verdict, struct bpf_prog *parse) { @@ -1450,14 +1490,13 @@ static void smap_gc_work(struct work_struct *w) kfree(psock); } -static struct smap_psock *smap_init_psock(struct sock *sock, - struct bpf_stab *stab) +static struct smap_psock *smap_init_psock(struct sock *sock, int node) { struct smap_psock *psock; psock = kzalloc_node(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN, - stab->map.numa_node); + node); if (!psock) return ERR_PTR(-ENOMEM); @@ -1525,12 +1564,14 @@ free_stab: return ERR_PTR(err); } -static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +static void smap_list_remove(struct smap_psock *psock, + struct sock **entry, + struct htab_elem *hash_link) { struct smap_psock_map_entry *e, *tmp; list_for_each_entry_safe(e, tmp, &psock->maps, list) { - if (e->entry == entry) { + if (e->entry == entry || e->hash_link == hash_link) { list_del(&e->list); break; } @@ -1568,7 +1609,7 @@ static void sock_map_free(struct bpf_map *map) * to be null and queued for garbage collection. */ if (likely(psock)) { - smap_list_remove(psock, &stab->sock_map[i]); + smap_list_remove(psock, &stab->sock_map[i], NULL); smap_release_sock(psock, sock); } write_unlock_bh(&sock->sk_callback_lock); @@ -1627,7 +1668,7 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (psock->bpf_parse) smap_stop_sock(psock, sock); - smap_list_remove(psock, &stab->sock_map[k]); + smap_list_remove(psock, &stab->sock_map[k], NULL); smap_release_sock(psock, sock); out: write_unlock_bh(&sock->sk_callback_lock); @@ -1662,40 +1703,26 @@ out: * - sock_map must use READ_ONCE and (cmp)xchg operations * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ -static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, - struct bpf_map *map, - void *key, u64 flags) + +static int __sock_map_ctx_update_elem(struct bpf_map *map, + struct bpf_sock_progs *progs, + struct sock *sock, + struct sock **map_link, + void *key) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse, *tx_msg; - struct sock *osock, *sock; + struct smap_psock_map_entry *e = NULL; struct smap_psock *psock; - u32 i = *(u32 *)key; bool new = false; - int err; - - if (unlikely(flags > BPF_EXIST)) - return -EINVAL; - - if (unlikely(i >= stab->map.max_entries)) - return -E2BIG; - - sock = READ_ONCE(stab->sock_map[i]); - if (flags == BPF_EXIST && !sock) - return -ENOENT; - else if (flags == BPF_NOEXIST && sock) - return -EEXIST; - - sock = skops->sk; + int err = 0; /* 1. If sock map has BPF programs those will be inherited by the * sock being added. If the sock is already attached to BPF programs * this results in an error. */ - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); - tx_msg = READ_ONCE(stab->bpf_tx_msg); + verdict = READ_ONCE(progs->bpf_verdict); + parse = READ_ONCE(progs->bpf_parse); + tx_msg = READ_ONCE(progs->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -1748,7 +1775,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } } else { - psock = smap_init_psock(sock, stab); + psock = smap_init_psock(sock, map->numa_node); if (IS_ERR(psock)) { err = PTR_ERR(psock); goto out_progs; @@ -1758,12 +1785,13 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, new = true; } - e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); - if (!e) { - err = -ENOMEM; - goto out_progs; + if (map_link) { + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } } - e->entry = &stab->sock_map[i]; /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. @@ -1780,7 +1808,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = smap_init_sock(psock, sock); if (err) goto out_free; - smap_init_progs(psock, stab, verdict, parse); + smap_init_progs(psock, verdict, parse); smap_start_sock(psock, sock); } @@ -1789,19 +1817,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * it with. Because we can only have a single set of programs if * old_sock has a strp we can stop it. */ - list_add_tail(&e->list, &psock->maps); - write_unlock_bh(&sock->sk_callback_lock); - - osock = xchg(&stab->sock_map[i], sock); - if (osock) { - struct smap_psock *opsock = smap_psock_sk(osock); - - write_lock_bh(&osock->sk_callback_lock); - smap_list_remove(opsock, &stab->sock_map[i]); - smap_release_sock(opsock, osock); - write_unlock_bh(&osock->sk_callback_lock); + if (map_link) { + e->entry = map_link; + list_add_tail(&e->list, &psock->maps); } - return 0; + write_unlock_bh(&sock->sk_callback_lock); + return err; out_free: smap_release_sock(psock, sock); out_progs: @@ -1816,23 +1837,73 @@ out_progs: return err; } -int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs = &stab->progs; + struct sock *osock, *sock; + u32 i = *(u32 *)key; + int err; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + sock = READ_ONCE(stab->sock_map[i]); + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) + return -EEXIST; + + sock = skops->sk; + err = __sock_map_ctx_update_elem(map, progs, sock, &stab->sock_map[i], + key); + if (err) + goto out; + + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + smap_list_remove(opsock, &stab->sock_map[i], NULL); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } +out: + return err; +} + +int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) +{ + struct bpf_sock_progs *progs; struct bpf_prog *orig; - if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP)) + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } else { return -EINVAL; + } switch (type) { case BPF_SK_MSG_VERDICT: - orig = xchg(&stab->bpf_tx_msg, prog); + orig = xchg(&progs->bpf_tx_msg, prog); break; case BPF_SK_SKB_STREAM_PARSER: - orig = xchg(&stab->bpf_parse, prog); + orig = xchg(&progs->bpf_parse, prog); break; case BPF_SK_SKB_STREAM_VERDICT: - orig = xchg(&stab->bpf_verdict, prog); + orig = xchg(&progs->bpf_verdict, prog); break; default: return -EOPNOTSUPP; @@ -1880,21 +1951,421 @@ static int sock_map_update_elem(struct bpf_map *map, static void sock_map_release(struct bpf_map *map) { - struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_sock_progs *progs; struct bpf_prog *orig; - orig = xchg(&stab->bpf_parse, NULL); + if (map->map_type == BPF_MAP_TYPE_SOCKMAP) { + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + progs = &stab->progs; + } else { + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + progs = &htab->progs; + } + + orig = xchg(&progs->bpf_parse, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_verdict, NULL); + orig = xchg(&progs->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); - orig = xchg(&stab->bpf_tx_msg, NULL); + orig = xchg(&progs->bpf_tx_msg, NULL); if (orig) bpf_prog_put(orig); } +static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int i, err; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->value_size != 4 || + attr->map_flags & ~SOCK_CREATE_FLAG_MASK) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&htab->map, attr); + + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + err = -EINVAL; + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) + goto free_htab; + + err = -ENOMEM; + htab->buckets = bpf_map_area_alloc( + htab->n_buckets * sizeof(struct bucket), + htab->map.numa_node); + if (!htab->buckets) + goto free_htab; + + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } + + return &htab->map; +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + +static void sock_hash_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get + * the sk_callback_lock before this case but after xchg + * causing the refcnt to hit zero and sock user data + * (psock) to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + kfree(l); + } + } + rcu_read_unlock(); + bpf_map_area_free(htab->buckets); + kfree(htab); +} + +static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab, + void *key, u32 key_size, u32 hash, + struct sock *sk, + struct htab_elem *old_elem) +{ + struct htab_elem *l_new; + + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + if (!old_elem) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + } + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); + if (!l_new) + return ERR_PTR(-ENOMEM); + + memcpy(l_new->key, key, key_size); + l_new->sk = sk; + l_new->hash = hash; + return l_new; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, + u32 hash, void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) { + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + } + + return NULL; +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static int sock_hash_get_next_key(struct bpf_map *map, + void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l, *next_l; + struct hlist_head *h; + u32 hash, key_size; + int i = 0; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); + h = select_bucket(htab, hash); + + l = lookup_elem_raw(h, hash, key, key_size); + if (!l) + goto find_first_elem; + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + if (next_l) { + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + h = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(h)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* iterated over all buckets and all elements */ + return -ENOENT; +} + +static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct bpf_sock_progs *progs = &htab->progs; + struct htab_elem *l_new = NULL, *l_old; + struct smap_psock_map_entry *e = NULL; + struct hlist_head *head; + struct smap_psock *psock; + u32 key_size, hash; + struct sock *sock; + struct bucket *b; + int err; + + sock = skops->sk; + + if (sock->sk_type != SOCK_STREAM || + sock->sk_protocol != IPPROTO_TCP) + return -EOPNOTSUPP; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) + return -ENOMEM; + + WARN_ON_ONCE(!rcu_read_lock_held()); + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + err = __sock_map_ctx_update_elem(map, progs, sock, NULL, key); + if (err) + goto err; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_bh(&b->lock); + l_old = lookup_elem_raw(head, hash, key, key_size); + if (l_old && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto bucket_err; + } + if (!l_old && map_flags == BPF_EXIST) { + err = -ENOENT; + goto bucket_err; + } + + l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old); + if (IS_ERR(l_new)) { + err = PTR_ERR(l_new); + goto bucket_err; + } + + psock = smap_psock_sk(sock); + if (unlikely(!psock)) { + err = -EINVAL; + goto bucket_err; + } + + e->hash_link = l_new; + e->htab = container_of(map, struct bpf_htab, map); + list_add_tail(&e->list, &psock->maps); + + /* add new element to the head of the list, so that + * concurrent search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + psock = smap_psock_sk(l_old->sk); + + hlist_del_rcu(&l_old->hash_node); + smap_list_remove(psock, NULL, l_old); + smap_release_sock(psock, l_old->sk); + free_htab_elem(htab, l_old); + } + raw_spin_unlock_bh(&b->lock); + return 0; +bucket_err: + raw_spin_unlock_bh(&b->lock); +err: + kfree(e); + psock = smap_psock_sk(sock); + if (psock) + smap_release_sock(psock, sock); + return err; +} + +static int sock_hash_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_hash_ctx_update_elem(&skops, map, key, flags); + fput(socket->file); + return err; +} + +static int sock_hash_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct bucket *b; + struct htab_elem *l; + u32 hash, key_size; + int ret = -ENOENT; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + if (l) { + struct sock *sock = l->sk; + struct smap_psock *psock; + + hlist_del_rcu(&l->hash_node); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, NULL, l); + smap_release_sock(psock, sock); + } + write_unlock_bh(&sock->sk_callback_lock); + free_htab_elem(htab, l); + ret = 0; + } + raw_spin_unlock_bh(&b->lock); + return ret; +} + +struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 key_size, hash; + struct bucket *b; + struct sock *sk; + + key_size = map->key_size; + hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; + + raw_spin_lock_bh(&b->lock); + l = lookup_elem_raw(head, hash, key, key_size); + sk = l ? l->sk : NULL; + raw_spin_unlock_bh(&b->lock); + return sk; +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -1905,6 +2376,15 @@ const struct bpf_map_ops sock_map_ops = { .map_release_uref = sock_map_release, }; +const struct bpf_map_ops sock_hash_ops = { + .map_alloc = sock_hash_alloc, + .map_free = sock_hash_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_hash_get_next_key, + .map_update_elem = sock_hash_update_elem, + .map_delete_elem = sock_hash_delete_elem, +}; + BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { @@ -1922,3 +2402,21 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); +} + +const struct bpf_func_proto bpf_sock_hash_update_proto = { + .func = bpf_sock_hash_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 57eeb1234b67..b675a3f3d141 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -11,6 +11,7 @@ #include <linux/perf_event.h> #include <linux/elf.h> #include <linux/pagemap.h> +#include <linux/irq_work.h> #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -32,6 +33,23 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; +/* irq_work to run up_read() for build_id lookup in nmi context */ +struct stack_map_irq_work { + struct irq_work irq_work; + struct rw_semaphore *sem; +}; + +static void do_up_read(struct irq_work *entry) +{ + struct stack_map_irq_work *work; + + work = container_of(entry, struct stack_map_irq_work, irq_work); + up_read(work->sem); + work->sem = NULL; +} + +static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); + static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -262,27 +280,31 @@ out: return ret; } -static void stack_map_get_build_id_offset(struct bpf_map *map, - struct stack_map_bucket *bucket, +static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; struct vm_area_struct *vma; - struct bpf_stack_build_id *id_offs; - - bucket->nr = trace_nr; - id_offs = (struct bpf_stack_build_id *)bucket->data; + bool irq_work_busy = false; + struct stack_map_irq_work *work = NULL; + + if (in_nmi()) { + work = this_cpu_ptr(&up_read_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } /* - * We cannot do up_read() in nmi context, so build_id lookup is - * only supported for non-nmi events. If at some point, it is - * possible to run find_vma() without taking the semaphore, we - * would like to allow build_id lookup in nmi context. + * We cannot do up_read() in nmi context. To do build_id lookup + * in nmi context, we need to run up_read() in irq_work. We use + * a percpu variable to do the irq_work. If the irq_work is + * already used by another lookup, we fall back to report ips. * * Same fallback is used for kernel stack (!user) on a stackmap * with build_id. */ - if (!user || !current || !current->mm || in_nmi() || + if (!user || !current || !current->mm || irq_work_busy || down_read_trylock(¤t->mm->mmap_sem) == 0) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { @@ -304,7 +326,13 @@ static void stack_map_get_build_id_offset(struct bpf_map *map, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - up_read(¤t->mm->mmap_sem); + + if (!work) { + up_read(¤t->mm->mmap_sem); + } else { + work->sem = ¤t->mm->mmap_sem; + irq_work_queue(&work->irq_work); + } } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, @@ -361,8 +389,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; - stack_map_get_build_id_offset(map, new_bucket, ips, - trace_nr, user); + new_bucket->nr = trace_nr; + stack_map_get_build_id_offset( + (struct bpf_stack_build_id *)new_bucket->data, + ips, trace_nr, user); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { @@ -405,6 +435,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + bool user_build_id = flags & BPF_F_USER_BUILD_ID; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + int err = -EINVAL; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; + if (kernel && user_build_id) + goto clear; + + elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) + : sizeof(u64); + if (unlikely(size % elem_size)) + goto clear; + + num_elem = size / elem_size; + if (sysctl_perf_event_max_stack < num_elem) + init_nr = 0; + else + init_nr = sysctl_perf_event_max_stack - num_elem; + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + if (unlikely(!trace)) + goto err_fault; + + trace_nr = trace->nr - init_nr; + if (trace_nr < skip) + goto err_fault; + + trace_nr -= skip; + trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; + copy_len = trace_nr * elem_size; + ips = trace->ip + skip + init_nr; + if (user && user_build_id) + stack_map_get_build_id_offset(buf, ips, trace_nr, user); + else + memcpy(buf, ips, copy_len); + + if (size > copy_len) + memset(buf + copy_len, 0, size - copy_len); + return copy_len; + +err_fault: + err = -EFAULT; +clear: + memset(buf, 0, size); + return err; +} + +const struct bpf_func_proto bpf_get_stack_proto = { + .func = bpf_get_stack, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { @@ -511,3 +608,16 @@ const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; + +static int __init stack_map_init(void) +{ + int cpu; + struct stack_map_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&up_read_work, cpu); + init_irq_work(&work->irq_work, do_up_read); + } + return 0; +} +subsys_initcall(stack_map_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 016ef9025827..35dc466641f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -11,13 +11,17 @@ */ #include <linux/bpf.h> #include <linux/bpf_trace.h> +#include <linux/bpf_lirc.h> +#include <linux/btf.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/vmalloc.h> #include <linux/mmzone.h> #include <linux/anon_inodes.h> +#include <linux/fdtable.h> #include <linux/file.h> +#include <linux/fs.h> #include <linux/license.h> #include <linux/filter.h> #include <linux/version.h> @@ -26,6 +30,7 @@ #include <linux/cred.h> #include <linux/timekeeping.h> #include <linux/ctype.h> +#include <linux/btf.h> #include <linux/nospec.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ @@ -63,9 +68,9 @@ static const struct bpf_map_ops * const bpf_map_types[] = { * copy_from_user() call. However, this is not a concern since this function is * meant to be a future-proofing of bits. */ -static int check_uarg_tail_zero(void __user *uaddr, - size_t expected_size, - size_t actual_size) +int bpf_check_uarg_tail_zero(void __user *uaddr, + size_t expected_size, + size_t actual_size) { unsigned char __user *addr; unsigned char __user *end; @@ -273,6 +278,7 @@ static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) if (atomic_dec_and_test(&map->refcnt)) { /* bpf_map_free_id() must be called first */ bpf_map_free_id(map, do_idr_lock); + btf_put(map->btf); INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } @@ -282,6 +288,7 @@ void bpf_map_put(struct bpf_map *map) { __bpf_map_put(map, true); } +EXPORT_SYMBOL_GPL(bpf_map_put); void bpf_map_put_with_uref(struct bpf_map *map) { @@ -320,13 +327,15 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "map_id:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT); + map->pages * 1ULL << PAGE_SHIFT, + map->id); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -418,7 +427,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_ifindex +#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -452,6 +461,33 @@ static int map_create(union bpf_attr *attr) atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); + if (bpf_map_support_seq_show(map) && + (attr->btf_key_type_id || attr->btf_value_type_id)) { + struct btf *btf; + + if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + err = -EINVAL; + goto free_map_nouncharge; + } + + btf = btf_get_by_fd(attr->btf_fd); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto free_map_nouncharge; + } + + err = map->ops->map_check_btf(map, btf, attr->btf_key_type_id, + attr->btf_value_type_id); + if (err) { + btf_put(btf); + goto free_map_nouncharge; + } + + map->btf = btf; + map->btf_key_type_id = attr->btf_key_type_id; + map->btf_value_type_id = attr->btf_value_type_id; + } + err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge; @@ -476,7 +512,6 @@ static int map_create(union bpf_attr *attr) return err; } - trace_bpf_map_create(map, err); return err; free_map: @@ -484,6 +519,7 @@ free_map: free_map_sec: security_bpf_map_free(map); free_map_nouncharge: + btf_put(map->btf); map->ops->map_free(map); return err; } @@ -516,6 +552,7 @@ struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref) atomic_inc(&map->usercnt); return map; } +EXPORT_SYMBOL_GPL(bpf_map_inc); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -635,7 +672,6 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; - trace_bpf_map_lookup_elem(map, ufd, key, value); err = 0; free_value: @@ -732,8 +768,6 @@ static int map_update_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_update_elem(map, ufd, key, value); free_value: kfree(value); free_key: @@ -786,8 +820,6 @@ static int map_delete_elem(union bpf_attr *attr) __this_cpu_dec(bpf_prog_active); preempt_enable(); out: - if (!err) - trace_bpf_map_delete_elem(map, ufd, key); kfree(key); err_put: fdput(f); @@ -851,7 +883,6 @@ out: if (copy_to_user(unext_key, next_key, map->key_size) != 0) goto free_next_key; - trace_bpf_map_next_key(map, ufd, key, next_key); err = 0; free_next_key: @@ -1003,15 +1034,9 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { - int i; - - trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - - for (i = 0; i < prog->aux->func_cnt; i++) - bpf_prog_kallsyms_del(prog->aux->func[i]); - bpf_prog_kallsyms_del(prog); + bpf_prog_kallsyms_del_all(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1042,11 +1067,13 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" - "memlock:\t%llu\n", + "memlock:\t%llu\n" + "prog_id:\t%u\n", prog->type, prog->jited, prog_tag, - prog->pages * 1ULL << PAGE_SHIFT); + prog->pages * 1ULL << PAGE_SHIFT, + prog->aux->id); } #endif @@ -1172,11 +1199,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; + return __bpf_prog_get(ufd, &type, attach_drv); } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); @@ -1226,6 +1249,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: return 0; default: return -EINVAL; @@ -1328,9 +1353,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* eBPF program is ready to be JITed */ - if (!prog->bpf_func) - prog = bpf_prog_select_runtime(prog, &err); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; @@ -1351,10 +1374,10 @@ static int bpf_prog_load(union bpf_attr *attr) } bpf_prog_kallsyms_add(prog); - trace_bpf_prog_load(prog, err); return err; free_used_maps: + bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); @@ -1543,6 +1566,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1556,6 +1581,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); + case BPF_LIRC_MODE2: + return lirc_prog_attach(attr); default: return -EINVAL; } @@ -1613,6 +1640,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1626,6 +1655,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); + case BPF_LIRC_MODE2: + return lirc_prog_detach(attr); default: return -EINVAL; } @@ -1670,9 +1701,13 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; + case BPF_LIRC_MODE2: + return lirc_prog_query(attr, uattr); default: return -EINVAL; } @@ -1879,7 +1914,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, u32 ulen; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1892,6 +1927,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.load_time = prog->aux->load_time; info.created_by_uid = from_kuid_munged(current_user_ns(), prog->aux->user->uid); + info.gpl_compatible = prog->gpl_compatible; memcpy(info.tag, prog->tag, sizeof(prog->tag)); memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); @@ -1912,6 +1948,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; + info.nr_jited_ksyms = 0; goto done; } @@ -1948,18 +1985,93 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, * for offload. */ ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; + if (prog->aux->func_cnt) { + u32 i; + + info.jited_prog_len = 0; + for (i = 0; i < prog->aux->func_cnt; i++) + info.jited_prog_len += prog->aux->func[i]->jited_len; + } else { + info.jited_prog_len = prog->jited_len; + } + if (info.jited_prog_len && ulen) { if (bpf_dump_raw_ok()) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + + /* for multi-function programs, copy the JITed + * instructions for all the functions + */ + if (prog->aux->func_cnt) { + u32 len, free, i; + u8 *img; + + free = ulen; + for (i = 0; i < prog->aux->func_cnt; i++) { + len = prog->aux->func[i]->jited_len; + len = min_t(u32, len, free); + img = (u8 *) prog->aux->func[i]->bpf_func; + if (copy_to_user(uinsns, img, len)) + return -EFAULT; + uinsns += len; + free -= len; + if (!free) + break; + } + } else { + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } } else { info.jited_prog_insns = 0; } } + ulen = info.nr_jited_ksyms; + info.nr_jited_ksyms = prog->aux->func_cnt; + if (info.nr_jited_ksyms && ulen) { + if (bpf_dump_raw_ok()) { + u64 __user *user_ksyms; + ulong ksym_addr; + u32 i; + + /* copy the address of the kernel symbol + * corresponding to each function + */ + ulen = min_t(u32, info.nr_jited_ksyms, ulen); + user_ksyms = u64_to_user_ptr(info.jited_ksyms); + for (i = 0; i < ulen; i++) { + ksym_addr = (ulong) prog->aux->func[i]->bpf_func; + ksym_addr &= PAGE_MASK; + if (put_user((u64) ksym_addr, &user_ksyms[i])) + return -EFAULT; + } + } else { + info.jited_ksyms = 0; + } + } + + ulen = info.nr_jited_func_lens; + info.nr_jited_func_lens = prog->aux->func_cnt; + if (info.nr_jited_func_lens && ulen) { + if (bpf_dump_raw_ok()) { + u32 __user *user_lens; + u32 func_len, i; + + /* copy the JITed image lengths for each function */ + ulen = min_t(u32, info.nr_jited_func_lens, ulen); + user_lens = u64_to_user_ptr(info.jited_func_lens); + for (i = 0; i < ulen; i++) { + func_len = prog->aux->func[i]->jited_len; + if (put_user(func_len, &user_lens[i])) + return -EFAULT; + } + } else { + info.jited_func_lens = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1977,7 +2089,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, u32 info_len = attr->info.info_len; int err; - err = check_uarg_tail_zero(uinfo, sizeof(info), info_len); + err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); if (err) return err; info_len = min_t(u32, sizeof(info), info_len); @@ -1990,6 +2102,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (map->btf) { + info.btf_id = btf_id(map->btf); + info.btf_key_type_id = map->btf_key_type_id; + info.btf_value_type_id = map->btf_value_type_id; + } + if (bpf_map_is_dev_bound(map)) { err = bpf_map_offload_info_fill(&info, map); if (err) @@ -2003,6 +2121,21 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } +static int bpf_btf_get_info_by_fd(struct btf *btf, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); + if (err) + return err; + + return btf_get_info_by_fd(btf, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -2025,6 +2158,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (f.file->f_op == &bpf_map_fops) err = bpf_map_get_info_by_fd(f.file->private_data, attr, uattr); + else if (f.file->f_op == &btf_fops) + err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); else err = -EINVAL; @@ -2032,6 +2167,158 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } +#define BPF_BTF_LOAD_LAST_FIELD btf_log_level + +static int bpf_btf_load(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_BTF_LOAD)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btf_new_fd(attr); +} + +#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id + +static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) +{ + if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btf_get_fd_by_id(attr->btf_id); +} + +static int bpf_task_fd_query_copy(const union bpf_attr *attr, + union bpf_attr __user *uattr, + u32 prog_id, u32 fd_type, + const char *buf, u64 probe_offset, + u64 probe_addr) +{ + char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf); + u32 len = buf ? strlen(buf) : 0, input_len; + int err = 0; + + if (put_user(len, &uattr->task_fd_query.buf_len)) + return -EFAULT; + input_len = attr->task_fd_query.buf_len; + if (input_len && ubuf) { + if (!len) { + /* nothing to copy, just make ubuf NULL terminated */ + char zero = '\0'; + + if (put_user(zero, ubuf)) + return -EFAULT; + } else if (input_len >= len + 1) { + /* ubuf can hold the string with NULL terminator */ + if (copy_to_user(ubuf, buf, len + 1)) + return -EFAULT; + } else { + /* ubuf cannot hold the string with NULL terminator, + * do a partial copy with NULL terminator. + */ + char zero = '\0'; + + err = -ENOSPC; + if (copy_to_user(ubuf, buf, input_len - 1)) + return -EFAULT; + if (put_user(zero, ubuf + input_len - 1)) + return -EFAULT; + } + } + + if (put_user(prog_id, &uattr->task_fd_query.prog_id) || + put_user(fd_type, &uattr->task_fd_query.fd_type) || + put_user(probe_offset, &uattr->task_fd_query.probe_offset) || + put_user(probe_addr, &uattr->task_fd_query.probe_addr)) + return -EFAULT; + + return err; +} + +#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr + +static int bpf_task_fd_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + pid_t pid = attr->task_fd_query.pid; + u32 fd = attr->task_fd_query.fd; + const struct perf_event *event; + struct files_struct *files; + struct task_struct *task; + struct file *file; + int err; + + if (CHECK_ATTR(BPF_TASK_FD_QUERY)) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (attr->task_fd_query.flags != 0) + return -EINVAL; + + task = get_pid_task(find_vpid(pid), PIDTYPE_PID); + if (!task) + return -ENOENT; + + files = get_files_struct(task); + put_task_struct(task); + if (!files) + return -ENOENT; + + err = 0; + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + err = -EBADF; + else + get_file(file); + spin_unlock(&files->file_lock); + put_files_struct(files); + + if (err) + goto out; + + if (file->f_op == &bpf_raw_tp_fops) { + struct bpf_raw_tracepoint *raw_tp = file->private_data; + struct bpf_raw_event_map *btp = raw_tp->btp; + + err = bpf_task_fd_query_copy(attr, uattr, + raw_tp->prog->aux->id, + BPF_FD_TYPE_RAW_TRACEPOINT, + btp->tp->name, 0, 0); + goto put_file; + } + + event = perf_get_event(file); + if (!IS_ERR(event)) { + u64 probe_offset, probe_addr; + u32 prog_id, fd_type; + const char *buf; + + err = bpf_get_perf_event_info(event, &prog_id, &fd_type, + &buf, &probe_offset, + &probe_addr); + if (!err) + err = bpf_task_fd_query_copy(attr, uattr, prog_id, + fd_type, buf, + probe_offset, + probe_addr); + goto put_file; + } + + err = -ENOTSUPP; +put_file: + fput(file); +out: + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -2040,7 +2327,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; - err = check_uarg_tail_zero(uattr, sizeof(attr), size); + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; size = min_t(u32, size, sizeof(attr)); @@ -2112,6 +2399,15 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_RAW_TRACEPOINT_OPEN: err = bpf_raw_tracepoint_open(&attr); break; + case BPF_BTF_LOAD: + err = bpf_btf_load(&attr); + break; + case BPF_BTF_GET_FD_BY_ID: + err = bpf_btf_get_fd_by_id(&attr); + break; + case BPF_TASK_FD_QUERY: + err = bpf_task_fd_query(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 1f4bf68c12db..938d41211be7 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -43,6 +43,16 @@ struct tnum tnum_rshift(struct tnum a, u8 shift) return TNUM(a.value >> shift, a.mask >> shift); } +struct tnum tnum_arshift(struct tnum a, u8 min_shift) +{ + /* if a.value is negative, arithmetic shifting by minimum shift + * will have larger negative offset compared to more shifting. + * If a.value is nonnegative, arithmetic shifting by minimum shift + * will have larger positive offset compare to more shifting. + */ + return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); +} + struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1904e814f282..9e2bf834f13a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22,6 +22,7 @@ #include <linux/stringify.h> #include <linux/bsearch.h> #include <linux/sort.h> +#include <linux/perf_event.h> #include "disasm.h" @@ -186,6 +187,8 @@ struct bpf_call_arg_meta { bool pkt_access; int regno; int access_size; + s64 msize_smax_value; + u64 msize_umax_value; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -760,18 +763,19 @@ enum reg_arg_type { static int cmp_subprogs(const void *a, const void *b) { - return *(int *)a - *(int *)b; + return ((struct bpf_subprog_info *)a)->start - + ((struct bpf_subprog_info *)b)->start; } static int find_subprog(struct bpf_verifier_env *env, int off) { - u32 *p; + struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs); + p = bsearch(&off, env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs); if (!p) return -ENOENT; - return p - env->subprog_starts; + return p - env->subprog_info; } @@ -791,18 +795,24 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } - env->subprog_starts[env->subprog_cnt++] = off; - sort(env->subprog_starts, env->subprog_cnt, - sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + env->subprog_info[env->subprog_cnt++].start = off; + sort(env->subprog_info, env->subprog_cnt, + sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return 0; } static int check_subprogs(struct bpf_verifier_env *env) { int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; int insn_cnt = env->prog->len; + /* Add entry function. */ + ret = add_subprog(env, 0); + if (ret < 0) + return ret; + /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) @@ -822,16 +832,18 @@ static int check_subprogs(struct bpf_verifier_env *env) return ret; } + /* Add a fake 'exit' subprog which could simplify subprog iteration + * logic. 'subprog_cnt' should not be increased. + */ + subprog[env->subprog_cnt].start = insn_cnt; + if (env->log.level > 1) for (i = 0; i < env->subprog_cnt; i++) - verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + verbose(env, "func#%d @%d\n", i, subprog[i].start); /* now check that all jumps are within the same subprog */ - subprog_start = 0; - if (env->subprog_cnt == cur_subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[cur_subprog++]; + subprog_start = subprog[cur_subprog].start; + subprog_end = subprog[cur_subprog + 1].start; for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; @@ -856,10 +868,9 @@ next: return -EINVAL; } subprog_start = subprog_end; - if (env->subprog_cnt == cur_subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[cur_subprog++]; + cur_subprog++; + if (cur_subprog < env->subprog_cnt) + subprog_end = subprog[cur_subprog + 1].start; } } return 0; @@ -1298,6 +1309,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, switch (env->prog->type) { case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: /* dst_input() and dst_output() can't write for now */ if (t == BPF_WRITE) return false; @@ -1517,13 +1529,13 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno]; + u16 stack = env->subprog_info[func->subprogno].stack_depth; if (stack >= -off) return 0; /* update known max for given subprogram */ - env->subprog_stack_depth[func->subprogno] = -off; + env->subprog_info[func->subprogno].stack_depth = -off; return 0; } @@ -1535,9 +1547,9 @@ static int update_stack_depth(struct bpf_verifier_env *env, */ static int check_max_stack_depth(struct bpf_verifier_env *env) { - int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; + struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; @@ -1545,17 +1557,14 @@ process_func: /* round up to 32-bytes, since this is granularity * of interpreter stack size */ - depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", frame + 1, depth); return -EACCES; } continue_func: - if (env->subprog_cnt == subprog) - subprog_end = insn_cnt; - else - subprog_end = env->subprog_starts[subprog]; + subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { if (insn[i].code != (BPF_JMP | BPF_CALL)) continue; @@ -1563,17 +1572,16 @@ continue_func: continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; - ret_prog[frame] = subprog; + ret_prog[frame] = idx; /* find the callee */ i = i + insn[i].imm + 1; - subprog = find_subprog(env, i); - if (subprog < 0) { + idx = find_subprog(env, i); + if (idx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", i); return -EFAULT; } - subprog++; frame++; if (frame >= MAX_CALL_FRAMES) { WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); @@ -1586,10 +1594,10 @@ continue_func: */ if (frame == 0) return 0; - depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); frame--; i = ret_insn[frame]; - subprog = ret_prog[frame]; + idx = ret_prog[frame]; goto continue_func; } @@ -1605,11 +1613,34 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, start); return -EFAULT; } - subprog++; - return env->subprog_stack_depth[subprog]; + return env->subprog_info[subprog].stack_depth; } #endif +static int check_ctx_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + /* Access to ctx or passing it to a helper is only allowed in + * its original, unmodified form. + */ + + if (reg->off) { + verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", + regno, reg->off); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + return -EACCES; + } + + return 0; +} + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1679,24 +1710,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } - /* ctx accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - */ - if (reg->off) { - verbose(env, - "dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n", - regno, reg->off, off - reg->off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, - "variable ctx access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; + err = check_ctx_access(env, insn_idx, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { /* ctx access returns either a scalar, or a @@ -1961,7 +1979,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && + if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || @@ -1977,6 +1995,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; + err = check_ctx_reg(env, reg, regno); + if (err < 0) + return err; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2013,14 +2034,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->key_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->key_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->key_size, false, + NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2030,17 +2046,18 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } - if (type_is_pkt_pointer(type)) - err = check_packet_access(env, regno, reg->off, - meta->map_ptr->value_size, - false); - else - err = check_stack_boundary(env, regno, - meta->map_ptr->value_size, - false, NULL); + err = check_helper_mem_access(env, regno, + meta->map_ptr->value_size, false, + NULL); } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); + /* remember the mem_size which may be used later + * to refine return values. + */ + meta->msize_smax_value = reg->smax_value; + meta->msize_umax_value = reg->umax_value; + /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ @@ -2118,8 +2135,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_redirect_map) goto error; break; - /* Restrict bpf side of cpumap, open when use-cases appear */ + /* Restrict bpf side of cpumap and xskmap, open when use-cases + * appear. + */ case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; @@ -2135,6 +2155,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_msg_redirect_map) goto error; break; + case BPF_MAP_TYPE_SOCKHASH: + if (func_id != BPF_FUNC_sk_redirect_hash && + func_id != BPF_FUNC_sock_hash_update && + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_hash) + goto error; + break; default: break; } @@ -2144,7 +2171,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; - if (env->subprog_cnt) { + if (env->subprog_cnt > 1) { verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); return -EINVAL; } @@ -2166,16 +2193,20 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, break; case BPF_FUNC_redirect_map: if (map->map_type != BPF_MAP_TYPE_DEVMAP && - map->map_type != BPF_MAP_TYPE_CPUMAP) + map->map_type != BPF_MAP_TYPE_CPUMAP && + map->map_type != BPF_MAP_TYPE_XSKMAP) goto error; break; case BPF_FUNC_sk_redirect_map: case BPF_FUNC_msg_redirect_map: + case BPF_FUNC_sock_map_update: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; - case BPF_FUNC_sock_map_update: - if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + case BPF_FUNC_sk_redirect_hash: + case BPF_FUNC_msg_redirect_hash: + case BPF_FUNC_sock_hash_update: + if (map->map_type != BPF_MAP_TYPE_SOCKHASH) goto error; break; default: @@ -2316,7 +2347,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* remember the callsite, it will be used by bpf_exit */ *insn_idx /* callsite */, state->curframe + 1 /* frameno within this callchain */, - subprog + 1 /* subprog number within this prog */); + subprog /* subprog number within this prog */); /* copy r1 - r5 args that callee can access */ for (i = BPF_REG_1; i <= BPF_REG_5; i++) @@ -2380,6 +2411,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, + int func_id, + struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; + + if (ret_type != RET_INTEGER || + (func_id != BPF_FUNC_get_stack && + func_id != BPF_FUNC_probe_read_str)) + return; + + ret_reg->smax_value = meta->msize_smax_value; + ret_reg->umax_value = meta->msize_umax_value; + __reg_deduce_bounds(ret_reg); + __reg_bound_offset(ret_reg); +} + static int record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) @@ -2387,8 +2435,11 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; if (func_id != BPF_FUNC_tail_call && - func_id != BPF_FUNC_map_lookup_elem) + func_id != BPF_FUNC_map_lookup_elem && + func_id != BPF_FUNC_map_update_elem && + func_id != BPF_FUNC_map_delete_elem) return 0; + if (meta->map_ptr == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; @@ -2428,7 +2479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose(env, "cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); return -EINVAL; } @@ -2516,10 +2567,30 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; + if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { + const char *err_str; + +#ifdef CONFIG_PERF_EVENTS + err = get_callchain_buffers(sysctl_perf_event_max_stack); + err_str = "cannot get callchain buffer for func %s#%d\n"; +#else + err = -ENOTSUPP; + err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +#endif + if (err) { + verbose(env, err_str, func_id_name(func_id), func_id); + return err; + } + + env->prog->has_callchain_buf = true; + } + if (changes_data) clear_all_pkt_pointers(env); return 0; @@ -2964,10 +3035,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->umin_value <<= umin_val; dst_reg->umax_value <<= umax_val; } - if (src_known) - dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); - else - dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; @@ -2995,16 +3063,35 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, */ dst_reg->smin_value = S64_MIN; dst_reg->smax_value = S64_MAX; - if (src_known) - dst_reg->var_off = tnum_rshift(dst_reg->var_off, - umin_val); - else - dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val); + dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); dst_reg->umin_value >>= umax_val; dst_reg->umax_value >>= umin_val; /* We may learn something more from the var_off */ __update_reg_bounds(dst_reg); break; + case BPF_ARSH: + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. + */ + mark_reg_unknown(env, regs, insn->dst_reg); + break; + } + + /* Upon reaching here, src_known is true and + * umax_val is equal to umin_val. + */ + dst_reg->smin_value >>= umin_val; + dst_reg->smax_value >>= umin_val; + dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val); + + /* blow away the dst_reg umin_value/umax_value and rely on + * dst_reg var_off to refine the result. + */ + dst_reg->umin_value = 0; + dst_reg->umax_value = U64_MAX; + __update_reg_bounds(dst_reg); + break; default: mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -3888,7 +3975,12 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } - if (env->subprog_cnt) { + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (env->subprog_cnt > 1) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees * that can have arbitrary arguments. It's problematic @@ -4919,15 +5011,15 @@ process_bpf_exit: verbose(env, "processed %d insns (limit %d), stack depth ", insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt + 1; i++) { - u32 depth = env->subprog_stack_depth[i]; + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt + 1) + if (i + 1 < env->subprog_cnt) verbose(env, "+"); } verbose(env, "\n"); - env->prog->aux->stack_depth = env->subprog_stack_depth[0]; + env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -5051,7 +5143,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_bpf_prog_info() + * and all maps are released in free_used_maps() */ map = bpf_map_inc(map, false); if (IS_ERR(map)) { @@ -5114,7 +5206,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, if (cnt == 1) return 0; - new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + new_data = vzalloc(array_size(prog_len, + sizeof(struct bpf_insn_aux_data))); if (!new_data) return -ENOMEM; memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); @@ -5133,10 +5226,11 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len if (len == 1) return; - for (i = 0; i < env->subprog_cnt; i++) { - if (env->subprog_starts[i] < off) + /* NOTE: fake 'exit' subprog should be updated as well. */ + for (i = 0; i <= env->subprog_cnt; i++) { + if (env->subprog_info[i].start < off) continue; - env->subprog_starts[i] += len - 1; + env->subprog_info[i].start += len - 1; } } @@ -5210,7 +5304,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } } - if (!ops->convert_ctx_access) + if (!ops->convert_ctx_access || bpf_prog_is_dev_bound(env->prog->aux)) return 0; insn = env->prog->insnsi + delta; @@ -5270,6 +5364,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) */ is_narrower_load = size < ctx_field_size; if (is_narrower_load) { + u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); u32 off = insn->off; u8 size_code; @@ -5284,7 +5379,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else if (ctx_field_size == 8) size_code = BPF_DW; - insn->off = off & ~(ctx_field_size - 1); + insn->off = off & ~(size_default - 1); insn->code = BPF_LDX | BPF_MEM | size_code; } @@ -5328,7 +5423,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) void *old_bpf_func; int err = -ENOMEM; - if (env->subprog_cnt == 0) + if (env->subprog_cnt <= 1) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { @@ -5344,7 +5439,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* temporarily remember subprog id inside insn instead of * aux_data, since next loop will split up all insns into funcs */ - insn->off = subprog + 1; + insn->off = subprog; /* remember original imm in case JIT fails and fallback * to interpreter will be needed */ @@ -5353,16 +5448,13 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->imm = 1; } - func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); if (!func) return -ENOMEM; - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { subprog_start = subprog_end; - if (env->subprog_cnt == i) - subprog_end = prog->len; - else - subprog_end = env->subprog_starts[i]; + subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); @@ -5379,7 +5471,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * Long term would need debug info to populate names */ func[i]->aux->name[0] = 'F'; - func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { @@ -5392,20 +5484,33 @@ static int jit_subprogs(struct bpf_verifier_env *env) * now populate all bpf_calls with correct addresses and * run last pass of JIT */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->off = 0; insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) func[subprog]->bpf_func - __bpf_call_base; } + + /* we use the aux data to keep a list of the start addresses + * of the JITed images for each function in the program + * + * for some architectures, such as powerpc64, the imm field + * might not be large enough to hold the offset of the start + * address of the callee's JITed image from __bpf_call_base + * + * in such cases, we can lookup the start address of a callee + * by using its subprog id, available from the off field of + * the call instruction, as an index for this list + */ + func[i]->aux->func = func; + func[i]->aux->func_cnt = env->subprog_cnt; } - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { old_bpf_func = func[i]->bpf_func; tmp = bpf_int_jit_compile(func[i]); if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { @@ -5419,7 +5524,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* finally lock prog and jit images for all functions and * populate kallsysm */ - for (i = 0; i <= env->subprog_cnt; i++) { + for (i = 0; i < env->subprog_cnt; i++) { bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } @@ -5429,26 +5534,21 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - unsigned long addr; - if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); - addr = (unsigned long)func[subprog + 1]->bpf_func; - addr &= PAGE_MASK; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - addr - __bpf_call_base; + insn->imm = subprog; } prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; - prog->aux->func_cnt = env->subprog_cnt + 1; + prog->aux->func_cnt = env->subprog_cnt; return 0; out_free: - for (i = 0; i <= env->subprog_cnt; i++) + for (i = 0; i < env->subprog_cnt; i++) if (func[i]) bpf_jit_free(func[i]); kfree(func); @@ -5505,6 +5605,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + const struct bpf_map_ops *ops; struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; @@ -5552,6 +5653,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) @@ -5615,35 +5735,61 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) } /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup - * handlers are currently limited to 64 bit only. + * and other inlining handlers are currently limited to 64 bit + * only. */ if (prog->jit_requested && BITS_PER_LONG == 64 && - insn->imm == BPF_FUNC_map_lookup_elem) { + (insn->imm == BPF_FUNC_map_lookup_elem || + insn->imm == BPF_FUNC_map_update_elem || + insn->imm == BPF_FUNC_map_delete_elem)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; map_ptr = BPF_MAP_PTR(aux->map_state); - if (!map_ptr->ops->map_gen_lookup) - goto patch_call_imm; + ops = map_ptr->ops; + if (insn->imm == BPF_FUNC_map_lookup_elem && + ops->map_gen_lookup) { + cnt = ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); - if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; - } + new_prog = bpf_patch_insn_data(env, i + delta, + insn_buf, cnt); + if (!new_prog) + return -ENOMEM; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, - cnt); - if (!new_prog) - return -ENOMEM; + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } - delta += cnt - 1; + BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_delete_elem, + (int (*)(struct bpf_map *map, void *key))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_update_elem, + (int (*)(struct bpf_map *map, void *key, void *value, + u64 flags))NULL)); + switch (insn->imm) { + case BPF_FUNC_map_lookup_elem: + insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_update_elem: + insn->imm = BPF_CAST_CALL(ops->map_update_elem) - + __bpf_call_base; + continue; + case BPF_FUNC_map_delete_elem: + insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - + __bpf_call_base; + continue; + } - /* keep walking new program and skip insns we just inserted */ - env->prog = prog = new_prog; - insn = new_prog->insnsi + i + delta; - continue; + goto patch_call_imm; } if (insn->imm == BPF_FUNC_redirect_map) { @@ -5725,8 +5871,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) return -ENOMEM; log = &env->log; - env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * - (*prog)->len); + env->insn_aux_data = + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), + (*prog)->len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; @@ -5755,16 +5902,16 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) - goto err_unlock; + goto skip_full_check; } - ret = replace_map_fd_with_map_ptr(env); - if (ret < 0) - goto skip_full_check; - env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), GFP_USER); @@ -5835,7 +5982,7 @@ skip_full_check: err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release - * them now. Otherwise free_bpf_prog_info() will release them. + * them now. Otherwise free_used_maps() will release them. */ release_maps(env); *prog = env->prog; diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c new file mode 100644 index 000000000000..b3c557476a8d --- /dev/null +++ b/kernel/bpf/xskmap.c @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/bpf.h> +#include <linux/capability.h> +#include <net/xdp_sock.h> +#include <linux/slab.h> +#include <linux/sched.h> + +struct xsk_map { + struct bpf_map map; + struct xdp_sock **xsk_map; + struct list_head __percpu *flush_list; +}; + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + int cpu, err = -EINVAL; + struct xsk_map *m; + u64 cost; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + m = kzalloc(sizeof(*m), GFP_USER); + if (!m) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&m->map, attr); + + cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); + cost += sizeof(struct list_head) * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_m; + + m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* Notice returns -EPERM on if map size is larger than memlock limit */ + err = bpf_map_precharge_memlock(m->map.pages); + if (err) + goto free_m; + + err = -ENOMEM; + + m->flush_list = alloc_percpu(struct list_head); + if (!m->flush_list) + goto free_m; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); + + m->xsk_map = bpf_map_area_alloc(m->map.max_entries * + sizeof(struct xdp_sock *), + m->map.numa_node); + if (!m->xsk_map) + goto free_percpu; + return &m->map; + +free_percpu: + free_percpu(m->flush_list); +free_m: + kfree(m); + return ERR_PTR(err); +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + int i; + + synchronize_net(); + + for (i = 0; i < map->max_entries; i++) { + struct xdp_sock *xs; + + xs = m->xsk_map[i]; + if (!xs) + continue; + + sock_put((struct sock *)xs); + } + + free_percpu(m->flush_list); + bpf_map_area_free(m->xsk_map); + kfree(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs; + + if (key >= map->max_entries) + return NULL; + + xs = READ_ONCE(m->xsk_map[key]); + return xs; +} + +int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, + struct xdp_sock *xs) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + int err; + + err = xsk_rcv(xs, xdp); + if (err) + return err; + + if (!xs->flush_node.prev) + list_add(&xs->flush_node, flush_list); + + return 0; +} + +void __xsk_map_flush(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct list_head *flush_list = this_cpu_ptr(m->flush_list); + struct xdp_sock *xs, *tmp; + + list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { + xsk_flush(xs); + __list_del(xs->flush_node.prev, xs->flush_node.next); + xs->flush_node.prev = NULL; + } +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xdp_sock *xs, *old_xs; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + sock_hold(sock->sk); + + old_xs = xchg(&m->xsk_map[i], xs); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + sockfd_put(sock); + return 0; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + old_xs = xchg(&m->xsk_map[k], NULL); + if (old_xs) { + /* Make sure we've flushed everything. */ + synchronize_net(); + sock_put((struct sock *)old_xs); + } + + return 0; +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, +}; + + diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 2be89a003185..bfcdae896122 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o stat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 0808a33d16d3..77ff1cd6a252 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -201,13 +201,12 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* - * stat.c + * rstat.c */ -void cgroup_stat_flush(struct cgroup *cgrp); -int cgroup_stat_init(struct cgroup *cgrp); -void cgroup_stat_exit(struct cgroup *cgrp); -void cgroup_stat_show_cputime(struct seq_file *seq); -void cgroup_stat_boot(void); +int cgroup_rstat_init(struct cgroup *cgrp); +void cgroup_rstat_exit(struct cgroup *cgrp); +void cgroup_rstat_boot(void); +void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e06c97f3ed1a..8b4f0768efd6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -195,9 +195,9 @@ struct cgroup_pidlist { static void *pidlist_allocate(int count) { if (PIDLIST_TOO_LARGE(count)) - return vmalloc(count * sizeof(pid_t)); + return vmalloc(array_size(count, sizeof(pid_t))); else - return kmalloc(count * sizeof(pid_t), GFP_KERNEL); + return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL); } static void pidlist_free(void *p) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 12883656e63e..077370bf8964 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/sched/cputime.h> #include <net/sock.h> #define CREATE_TRACE_POINTS @@ -61,6 +62,8 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) +/* let's not notify more than 100 times per second */ +#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * cgroup_mutex is the master lock. Any modification to cgroup or its @@ -142,14 +145,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; +struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -1554,6 +1557,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); + + del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -1573,8 +1578,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - list_for_each_entry(cfts, &css->ss->cfts, node) + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_base_files; + else + cfts = cgroup1_base_files; + cgroup_addrm_files(css, cgrp, cfts, false); + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); + } } /** @@ -1598,14 +1612,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css) else cfts = cgroup1_base_files; - return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - } - - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; + ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + if (ret < 0) + return ret; + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } } @@ -1782,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -1797,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void) * tasklist if we walk through it with RCU. */ read_lock(&tasklist_lock); + spin_lock_irq(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + do_each_thread(g, p) { WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1824,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void) } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); out_unlock: spin_unlock_irq(&css_set_lock); + read_unlock(&tasklist_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1844,6 +1860,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + INIT_LIST_HEAD(&cgrp->rstat_css_list); + prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3381,7 +3399,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v) struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; - cgroup_stat_show_cputime(seq); + cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); #endif @@ -3521,6 +3539,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } +static void cgroup_file_notify_timer(struct timer_list *timer) +{ + cgroup_file_notify(container_of(timer, struct cgroup_file, + notify_timer)); +} + static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { @@ -3547,6 +3571,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; + timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); @@ -3796,8 +3822,17 @@ void cgroup_file_notify(struct cgroup_file *cfile) unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); - if (cfile->kn) - kernfs_notify(cfile->kn); + if (cfile->kn) { + unsigned long last = cfile->notified_at; + unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + } else { + kernfs_notify(cfile->kn); + cfile->notified_at = jiffies; + } + } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } @@ -4560,7 +4595,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); if (cgroup_on_dfl(cgrp)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4587,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ + if (!list_empty(&css->rstat_css_node)) { + cgroup_rstat_flush(cgrp); + list_del_rcu(&css->rstat_css_node); + } + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); @@ -4597,7 +4637,7 @@ static void css_release_work_fn(struct work_struct *work) trace_cgroup_release(cgrp); if (cgroup_on_dfl(cgrp)) - cgroup_stat_flush(cgrp); + cgroup_rstat_flush(cgrp); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) @@ -4648,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -4656,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } + if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); + BUG_ON(cgroup_css(cgrp, ss)); } @@ -4757,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: + list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); @@ -4775,8 +4820,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent) int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp) + - sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), + GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -4785,7 +4830,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_free_cgrp; if (cgroup_on_dfl(parent)) { - ret = cgroup_stat_init(cgrp); + ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; } @@ -4850,7 +4895,7 @@ out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: if (cgroup_on_dfl(parent)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5090,10 +5135,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* - * Remove @cgrp directory along with the base files. @cgrp has an - * extra ref on its kn. - */ + /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ + css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (parent && cgroup_is_threaded(cgrp)) @@ -5245,7 +5288,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_stat_boot(); + cgroup_rstat_boot(); /* * The latency of the synchronize_sched() is too high for cgroups, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b42037e6e81d..266f10cb7222 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -605,7 +605,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt + * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt * for a background explanation of this. * * Does not return errors, on the theory that the callers of this @@ -683,7 +683,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); + csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -753,7 +753,8 @@ restart: * The rest of the code, including the scheduler, can deal with * dattr==NULL case. No need to abort if alloc fails. */ - dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); + dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), + GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index defad3c5e7dc..d3bbb757ee49 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device); static int parse_resource(char *c, int *intval) { substring_t argstr; - const char **table = &rdmacg_resource_names[0]; char *name, *value = c; size_t len; - int ret, i = 0; + int ret, i; name = strsep(&value, "="); if (!name || !value) return -EINVAL; - len = strlen(value); + i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); + if (i < 0) + return i; - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; + len = strlen(value); - argstr.from = value; - argstr.to = value + len; + argstr.from = value; + argstr.to = value + len; - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + return -EINVAL; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; } return -EINVAL; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c new file mode 100644 index 000000000000..d503d1a9007c --- /dev/null +++ b/kernel/cgroup/rstat.c @@ -0,0 +1,416 @@ +#include "cgroup-internal.h" + +#include <linux/sched/cputime.h> + +static DEFINE_SPINLOCK(cgroup_rstat_lock); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); + +static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_cpu, cpu); +} + +/** + * cgroup_rstat_updated - keep track of updated rstat_cpu + * @cgrp: target cgroup + * @cpu: cpu on which rstat_cpu was updated + * + * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * cgroup_rstat_cpu definition for details. + */ +void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* nothing to do for root */ + if (!cgroup_parent(cgrp)) + return; + + /* + * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * see NULL updated_next or they see our updated stat. + */ + smp_mb(); + + /* + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (rstatc->updated_next) + break; + + rstatc->updated_next = prstatc->updated_children; + prstatc->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} +EXPORT_SYMBOL_GPL(cgroup_rstat_updated); + +/** + * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_rstat_cpu_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + rstatc = cgroup_rstat_cpu(pos, cpu); + if (rstatc->updated_children == pos) + break; + pos = rstatc->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && rstatc->updated_next) { + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup_rstat_cpu *nrstatc; + struct cgroup **nextp; + + nextp = &prstatc->updated_children; + while (true) { + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + + *nextp = rstatc->updated_next; + rstatc->updated_next = NULL; + + /* + * Paired with the one in cgroup_rstat_cpu_updated(). + * Either they see NULL updated_next or we see their + * updated stat. + */ + smp_mb(); + } + + return pos; +} + +/* see cgroup_rstat_flush() */ +static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) +{ + int cpu; + + lockdep_assert_held(&cgroup_rstat_lock); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; + + raw_spin_lock(cpu_lock); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + + cgroup_base_stat_flush(pos, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } + raw_spin_unlock(cpu_lock); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || + spin_needbreak(&cgroup_rstat_lock))) { + spin_unlock_irq(&cgroup_rstat_lock); + if (!cond_resched()) + cpu_relax(); + spin_lock_irq(&cgroup_rstat_lock); + } + } +} + +/** + * cgroup_rstat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + * + * This function may block. + */ +void cgroup_rstat_flush(struct cgroup *cgrp) +{ + might_sleep(); + + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); + spin_unlock_irq(&cgroup_rstat_lock); +} + +/** + * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() + * @cgrp: target cgroup + * + * This function can be called from any context. + */ +void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_rstat_lock, flags); + cgroup_rstat_flush_locked(cgrp, false); + spin_unlock_irqrestore(&cgroup_rstat_lock, flags); +} + +/** + * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold + * @cgrp: target cgroup + * + * Flush stats in @cgrp's subtree and prevent further flushes. Must be + * paired with cgroup_rstat_flush_release(). + * + * This function may block. + */ +void cgroup_rstat_flush_hold(struct cgroup *cgrp) + __acquires(&cgroup_rstat_lock) +{ + might_sleep(); + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); +} + +/** + * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + */ +void cgroup_rstat_flush_release(void) + __releases(&cgroup_rstat_lock) +{ + spin_unlock_irq(&cgroup_rstat_lock); +} + +int cgroup_rstat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has rstat_cpu preallocated */ + if (!cgrp->rstat_cpu) { + cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); + if (!cgrp->rstat_cpu) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + rstatc->updated_children = cgrp; + u64_stats_init(&rstatc->bsync); + } + + return 0; +} + +void cgroup_rstat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_rstat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + WARN_ON_ONCE(rstatc->updated_next)) + return; + } + + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; +} + +void __init cgroup_rstat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + + BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); +} + +/* + * Functions for cgroup basic resource statistics implemented on top of + * rstat. + */ +static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) +{ + dst_bstat->cputime.utime += src_bstat->cputime.utime; + dst_bstat->cputime.stime += src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; +} + +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; + struct task_cputime cputime; + struct cgroup_base_stat delta; + unsigned seq; + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&rstatc->bsync); + cputime = rstatc->bstat.cputime; + } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + + /* calculate the delta to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); + memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_base_stat_accumulate(&cgrp->bstat, &delta); + if (parent) + cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); +} + +static struct cgroup_rstat_cpu * +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = get_cpu_ptr(cgrp->rstat_cpu); + u64_stats_update_begin(&rstatc->bsync); + return rstatc; +} + +static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, + struct cgroup_rstat_cpu *rstatc) +{ + u64_stats_update_end(&rstatc->bsync); + cgroup_rstat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(rstatc); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_rstat_cpu *rstatc; + + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + rstatc->bstat.cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + rstatc->bstat.cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_base_stat_cputime_account_end(cgrp, rstatc); +} + +void cgroup_base_stat_cputime_show(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + cgroup_rstat_flush_hold(cgrp); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); + cgroup_rstat_flush_release(); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c deleted file mode 100644 index 1e111dd455c4..000000000000 --- a/kernel/cgroup/stat.c +++ /dev/null @@ -1,338 +0,0 @@ -#include "cgroup-internal.h" - -#include <linux/sched/cputime.h> - -static DEFINE_MUTEX(cgroup_stat_mutex); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); - -static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) -{ - return per_cpu_ptr(cgrp->cpu_stat, cpu); -} - -/** - * cgroup_cpu_stat_updated - keep track of updated cpu_stat - * @cgrp: target cgroup - * @cpu: cpu on which cpu_stat was updated - * - * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching - * cpu_stat->updated_children list. See the comment on top of - * cgroup_cpu_stat definition for details. - */ -static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *parent; - unsigned long flags; - - /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. - * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by - * testing the next pointer for NULL. - */ - if (cgroup_cpu_stat(cgrp, cpu)->updated_next) - return; - - raw_spin_lock_irqsave(cpu_lock, flags); - - /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - - /* - * Both additions and removals are bottom-up. If a cgroup - * is already in the tree, all ancestors are. - */ - if (cstat->updated_next) - break; - - cstat->updated_next = pcstat->updated_children; - pcstat->updated_children = cgrp; - } - - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - -/** - * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree - * @pos: current position - * @root: root of the tree to traversal - * @cpu: target cpu - * - * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_cpu_stat_lock held. - * - * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. - */ -static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) -{ - struct cgroup_cpu_stat *cstat; - struct cgroup *parent; - - if (pos == root) - return NULL; - - /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. - */ - if (!pos) - pos = root; - else - pos = cgroup_parent(pos); - - /* walk down to the first leaf */ - while (true) { - cstat = cgroup_cpu_stat(pos, cpu); - if (cstat->updated_children == pos) - break; - pos = cstat->updated_children; - } - - /* - * Unlink @pos from the tree. As the updated_children list is - * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. - */ - parent = cgroup_parent(pos); - if (parent && cstat->updated_next) { - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - struct cgroup_cpu_stat *ncstat; - struct cgroup **nextp; - - nextp = &pcstat->updated_children; - while (true) { - ncstat = cgroup_cpu_stat(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &ncstat->updated_next; - } - - *nextp = cstat->updated_next; - cstat->updated_next = NULL; - } - - return pos; -} - -static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, - struct cgroup_stat *src_stat) -{ - dst_stat->cputime.utime += src_stat->cputime.utime; - dst_stat->cputime.stime += src_stat->cputime.stime; - dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; -} - -static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct task_cputime *last_cputime = &cstat->last_cputime; - struct task_cputime cputime; - struct cgroup_stat delta; - unsigned seq; - - lockdep_assert_held(&cgroup_stat_mutex); - - /* fetch the current per-cpu values */ - do { - seq = __u64_stats_fetch_begin(&cstat->sync); - cputime = cstat->cputime; - } while (__u64_stats_fetch_retry(&cstat->sync, seq)); - - /* accumulate the deltas to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_stat_accumulate(&delta, &cgrp->pending_stat); - memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_stat_accumulate(&cgrp->stat, &delta); - if (parent) - cgroup_stat_accumulate(&parent->pending_stat, &delta); -} - -/* see cgroup_stat_flush() */ -static void cgroup_stat_flush_locked(struct cgroup *cgrp) -{ - int cpu; - - lockdep_assert_held(&cgroup_stat_mutex); - - for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *pos = NULL; - - raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) - cgroup_cpu_stat_flush_one(pos, cpu); - raw_spin_unlock_irq(cpu_lock); - } -} - -/** - * cgroup_stat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - */ -void cgroup_stat_flush(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_stat_mutex); - cgroup_stat_flush_locked(cgrp); - mutex_unlock(&cgroup_stat_mutex); -} - -static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) -{ - struct cgroup_cpu_stat *cstat; - - cstat = get_cpu_ptr(cgrp->cpu_stat); - u64_stats_update_begin(&cstat->sync); - return cstat; -} - -static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_cpu_stat *cstat) -{ - u64_stats_update_end(&cstat->sync); - cgroup_cpu_stat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(cstat); -} - -void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - cstat->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void __cgroup_account_cputime_field(struct cgroup *cgrp, - enum cpu_usage_stat index, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - - switch (index) { - case CPUTIME_USER: - case CPUTIME_NICE: - cstat->cputime.utime += delta_exec; - break; - case CPUTIME_SYSTEM: - case CPUTIME_IRQ: - case CPUTIME_SOFTIRQ: - cstat->cputime.stime += delta_exec; - break; - default: - break; - } - - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void cgroup_stat_show_cputime(struct seq_file *seq) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - mutex_lock(&cgroup_stat_mutex); - - cgroup_stat_flush_locked(cgrp); - - usage = cgrp->stat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, - &utime, &stime); - - mutex_unlock(&cgroup_stat_mutex); - - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - - seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); -} - -int cgroup_stat_init(struct cgroup *cgrp) -{ - int cpu; - - /* the root cgrp has cpu_stat preallocated */ - if (!cgrp->cpu_stat) { - cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); - if (!cgrp->cpu_stat) - return -ENOMEM; - } - - /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - cstat->updated_children = cgrp; - u64_stats_init(&cstat->sync); - } - - prev_cputime_init(&cgrp->stat.prev_cputime); - - return 0; -} - -void cgroup_stat_exit(struct cgroup *cgrp) -{ - int cpu; - - cgroup_stat_flush(cgrp); - - /* sanity check */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - if (WARN_ON_ONCE(cstat->updated_children != cgrp) || - WARN_ON_ONCE(cstat->updated_next)) - return; - } - - free_percpu(cgrp->cpu_stat); - cgrp->cpu_stat = NULL; -} - -void __init cgroup_stat_boot(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); - - BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); -} diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 946fb92418f7..81e9af7dcec2 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -12,7 +12,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y +CONFIG_STACKPROTECTOR_STRONG=y CONFIG_COMPACTION=y CONFIG_CPU_SW_DOMAIN_PAN=y CONFIG_DM_CRYPT=y diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 9bfdffc100da..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -10,7 +10,3 @@ CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y -CONFIG_CC_STACKPROTECTOR_NONE=y -# CONFIG_CC_STACKPROTECTOR_REGULAR is not set -# CONFIG_CC_STACKPROTECTOR_STRONG is not set -# CONFIG_CC_STACKPROTECTOR_AUTO is not set diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d676889..b66aced5e8c2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e405677ee08d..2ddfce8f1e8f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -691,7 +691,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) } if (!s->usable) return KDB_NOTIMP; - s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + s->command = kcalloc(s->count + 1, sizeof(*(s->command)), GFP_KDB); if (!s->command) { kdb_printf("Could not allocate new kdb_defcmd table for %s\n", cmdstr); @@ -729,8 +729,8 @@ static int kdb_defcmd(int argc, const char **argv) kdb_printf("Command only available during kdb_init()\n"); return KDB_NOTIMP; } - defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), - GFP_KDB); + defcmd_set = kmalloc_array(defcmd_set_count + 1, sizeof(*defcmd_set), + GFP_KDB); if (!defcmd_set) goto fail_defcmd; memcpy(defcmd_set, save_defcmd_set, @@ -2706,8 +2706,11 @@ int kdb_register_flags(char *cmd, } if (i >= kdb_max_commands) { - kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + - kdb_command_extend) * sizeof(*new), GFP_KDB); + kdbtab_t *new = kmalloc_array(kdb_max_commands - + KDB_BASE_CMD_MAX + + kdb_command_extend, + sizeof(*new), + GFP_KDB); if (!new) { kdb_printf("Could not allocate new kdb_command " "table\n"); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig new file mode 100644 index 000000000000..9bd54304446f --- /dev/null +++ b/kernel/dma/Kconfig @@ -0,0 +1,50 @@ + +config HAS_DMA + bool + depends on !NO_DMA + default y + +config NEED_SG_DMA_LENGTH + bool + +config NEED_DMA_MAP_STATE + bool + +config ARCH_DMA_ADDR_T_64BIT + def_bool 64BIT || PHYS_ADDR_T_64BIT + +config HAVE_GENERIC_DMA_COHERENT + bool + +config ARCH_HAS_SYNC_DMA_FOR_DEVICE + bool + +config ARCH_HAS_SYNC_DMA_FOR_CPU + bool + select NEED_DMA_MAP_STATE + +config DMA_DIRECT_OPS + bool + depends on HAS_DMA + +config DMA_NONCOHERENT_OPS + bool + depends on HAS_DMA + select DMA_DIRECT_OPS + +config DMA_NONCOHERENT_MMAP + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_NONCOHERENT_CACHE_SYNC + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_VIRT_OPS + bool + depends on HAS_DMA + +config SWIOTLB + bool + select DMA_DIRECT_OPS + select NEED_DMA_MAP_STATE diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile new file mode 100644 index 000000000000..6de44e4eb454 --- /dev/null +++ b/kernel/dma/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_HAS_DMA) += mapping.o +obj-$(CONFIG_DMA_CMA) += contiguous.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o +obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o +obj-$(CONFIG_DMA_VIRT_OPS) += virt.o +obj-$(CONFIG_DMA_API_DEBUG) += debug.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o + diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c new file mode 100644 index 000000000000..597d40893862 --- /dev/null +++ b/kernel/dma/coherent.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/dma-mapping.h> + +struct dma_coherent_mem { + void *virt_base; + dma_addr_t device_base; + unsigned long pfn_base; + int size; + int flags; + unsigned long *bitmap; + spinlock_t spinlock; + bool use_dev_dma_pfn_offset; +}; + +static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; + +static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) +{ + if (dev && dev->dma_mem) + return dev->dma_mem; + return NULL; +} + +static inline dma_addr_t dma_get_device_base(struct device *dev, + struct dma_coherent_mem * mem) +{ + if (mem->use_dev_dma_pfn_offset) + return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; + else + return mem->device_base; +} + +static int dma_init_coherent_memory( + phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, + struct dma_coherent_mem **mem) +{ + struct dma_coherent_mem *dma_mem = NULL; + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + int ret; + + if (!size) { + ret = -EINVAL; + goto out; + } + + mem_base = memremap(phys_addr, size, MEMREMAP_WC); + if (!mem_base) { + ret = -EINVAL; + goto out; + } + dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dma_mem) { + ret = -ENOMEM; + goto out; + } + dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dma_mem->bitmap) { + ret = -ENOMEM; + goto out; + } + + dma_mem->virt_base = mem_base; + dma_mem->device_base = device_addr; + dma_mem->pfn_base = PFN_DOWN(phys_addr); + dma_mem->size = pages; + dma_mem->flags = flags; + spin_lock_init(&dma_mem->spinlock); + + *mem = dma_mem; + return 0; + +out: + kfree(dma_mem); + if (mem_base) + memunmap(mem_base); + return ret; +} + +static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +{ + if (!mem) + return; + + memunmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} + +static int dma_assign_coherent_memory(struct device *dev, + struct dma_coherent_mem *mem) +{ + if (!dev) + return -ENODEV; + + if (dev->dma_mem) + return -EBUSY; + + dev->dma_mem = mem; + return 0; +} + +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + struct dma_coherent_mem *mem; + int ret; + + ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + if (ret) + return ret; + + ret = dma_assign_coherent_memory(dev, mem); + if (ret) + dma_release_coherent_memory(mem); + return ret; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dma_release_coherent_memory(mem); + dev->dma_mem = NULL; +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + unsigned long flags; + int pos, err; + + size += device_addr & ~PAGE_MASK; + + if (!mem) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&mem->spinlock, flags); + pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); + spin_unlock_irqrestore(&mem->spinlock, flags); + + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) +{ + int order = get_order(size); + unsigned long flags; + int pageno; + void *ret; + + spin_lock_irqsave(&mem->spinlock, flags); + + if (unlikely(size > (mem->size << PAGE_SHIFT))) + goto err; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (unlikely(pageno < 0)) + goto err; + + /* + * Memory was found in the coherent area. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + ret = mem->virt_base + (pageno << PAGE_SHIFT); + spin_unlock_irqrestore(&mem->spinlock, flags); + memset(ret, 0, size); + return ret; +err: + spin_unlock_irqrestore(&mem->spinlock, flags); + return NULL; +} + +/** + * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. + */ +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + if (!mem) + return 0; + + *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + if (*ret) + return 1; + + /* + * In the case where the allocation can not be satisfied from the + * per-device area, try to fall back to generic memory if the + * constraints allow it. + */ + return mem->flags & DMA_MEMORY_EXCLUSIVE; +} +EXPORT_SYMBOL(dma_alloc_from_dev_coherent); + +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +{ + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dma_coherent_default_memory, size, + dma_handle); +} + +static int __dma_release_from_coherent(struct dma_coherent_mem *mem, + int order, void *vaddr) +{ + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + unsigned long flags; + + spin_lock_irqsave(&mem->spinlock, flags); + bitmap_release_region(mem->bitmap, page, order); + spin_unlock_irqrestore(&mem->spinlock, flags); + return 1; + } + return 0; +} + +/** + * dma_release_from_dev_coherent() - free memory to device coherent memory pool + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if the caller should + * proceed with releasing memory from generic pools. + */ +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_release_from_coherent(mem, order, vaddr); +} +EXPORT_SYMBOL(dma_release_from_dev_coherent); + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + +static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, + struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) +{ + if (mem && vaddr >= mem->virt_base && vaddr + size <= + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + unsigned long off = vma->vm_pgoff; + int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; + int user_count = vma_pages(vma); + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + *ret = -ENXIO; + if (off < count && user_count <= count - off) { + unsigned long pfn = mem->pfn_base + start + off; + *ret = remap_pfn_range(vma, vma->vm_start, pfn, + user_count << PAGE_SHIFT, + vma->vm_page_prot); + } + return 1; + } + return 0; +} + +/** + * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool + * @dev: device from which the memory was allocated + * @vma: vm_area for the userspace memory + * @vaddr: cpu address returned by dma_alloc_from_dev_coherent + * @size: size of the memory buffer allocated + * @ret: result from remap_pfn_range() + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, maps that memory to the provided vma. + * + * Returns 1 if @vaddr belongs to the device coherent pool and the caller + * should return @ret, or 0 if they should proceed with mapping memory from + * generic areas. + */ +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, + void *vaddr, size_t size, int *ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); +} +EXPORT_SYMBOL(dma_mmap_from_dev_coherent); + +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, + size_t size, int *ret) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, + vaddr, size, ret); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_reserved_mem.h> + +static struct reserved_mem *dma_reserved_default_memory __initdata; + +static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + struct dma_coherent_mem *mem = rmem->priv; + int ret; + + if (!mem) { + ret = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + if (ret) { + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return ret; + } + } + mem->use_dev_dma_pfn_offset = true; + rmem->priv = mem; + dma_assign_coherent_memory(dev, mem); + return 0; +} + +static void rmem_dma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + if (dev) + dev->dma_mem = NULL; +} + +static const struct reserved_mem_ops rmem_dma_ops = { + .device_init = rmem_dma_device_init, + .device_release = rmem_dma_device_release, +}; + +static int __init rmem_dma_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL)) + return -EINVAL; + +#ifdef CONFIG_ARM + if (!of_get_flat_dt_prop(node, "no-map", NULL)) { + pr_err("Reserved memory: regions without no-map are not yet supported\n"); + return -EINVAL; + } + + if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { + WARN(dma_reserved_default_memory, + "Reserved memory: region for default DMA coherent area is redefined\n"); + dma_reserved_default_memory = rmem; + } +#endif + + rmem->ops = &rmem_dma_ops; + pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +static int __init dma_init_reserved_memory(void) +{ + const struct reserved_mem_ops *ops; + int ret; + + if (!dma_reserved_default_memory) + return -ENOMEM; + + ops = dma_reserved_default_memory->ops; + + /* + * We rely on rmem_dma_device_init() does not propagate error of + * dma_assign_coherent_memory() for "NULL" device. + */ + ret = ops->device_init(dma_reserved_default_memory, NULL); + + if (!ret) { + dma_coherent_default_memory = dma_reserved_default_memory->priv; + pr_info("DMA: default coherent area is set\n"); + } + + return ret; +} + +core_initcall(dma_init_reserved_memory); + +RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); +#endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c new file mode 100644 index 000000000000..d987dcd1bd56 --- /dev/null +++ b/kernel/dma/contiguous.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Contiguous Memory Allocator for DMA mapping framework + * Copyright (c) 2010-2011 by Samsung Electronics. + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <asm/page.h> +#include <asm/dma-contiguous.h> + +#include <linux/memblock.h> +#include <linux/err.h> +#include <linux/sizes.h> +#include <linux/dma-contiguous.h> +#include <linux/cma.h> + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +struct cma *dma_contiguous_default_area; + +/* + * Default global CMA area size can be defined in kernel's .config. + * This is useful mainly for distro maintainers to create a kernel + * that works correctly for most supported systems. + * The size can be set in bytes or as a percentage of the total memory + * in the system. + * + * Users, who want to set the size of global CMA area for their system + * should use cma= kernel parameter. + */ +static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline = -1; +static phys_addr_t base_cmdline; +static phys_addr_t limit_cmdline; + +static int __init early_cma(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + size_cmdline = memparse(p, &p); + if (*p != '@') + return 0; + base_cmdline = memparse(p + 1, &p); + if (*p != '-') { + limit_cmdline = base_cmdline + size_cmdline; + return 0; + } + limit_cmdline = memparse(p + 1, &p); + + return 0; +} +early_param("cma", early_cma); + +#ifdef CONFIG_CMA_SIZE_PERCENTAGE + +static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) +{ + struct memblock_region *reg; + unsigned long total_pages = 0; + + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + total_pages += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; +} + +#else + +static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) +{ + return 0; +} + +#endif + +/** + * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling + * @limit: End address of the reserved memory (optional, 0 for any). + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init dma_contiguous_reserve(phys_addr_t limit) +{ + phys_addr_t selected_size = 0; + phys_addr_t selected_base = 0; + phys_addr_t selected_limit = limit; + bool fixed = false; + + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); + + if (size_cmdline != -1) { + selected_size = size_cmdline; + selected_base = base_cmdline; + selected_limit = min_not_zero(limit_cmdline, limit); + if (base_cmdline + size_cmdline == limit_cmdline) + fixed = true; + } else { +#ifdef CONFIG_CMA_SIZE_SEL_MBYTES + selected_size = size_bytes; +#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) + selected_size = cma_early_percent_memory(); +#elif defined(CONFIG_CMA_SIZE_SEL_MIN) + selected_size = min(size_bytes, cma_early_percent_memory()); +#elif defined(CONFIG_CMA_SIZE_SEL_MAX) + selected_size = max(size_bytes, cma_early_percent_memory()); +#endif + } + + if (selected_size && !dma_contiguous_default_area) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + + dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); + } +} + +/** + * dma_contiguous_reserve_area() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas for specific + * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ + int ret; + + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, + "reserved", res_cma); + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); + + return 0; +} + +/** + * dma_alloc_from_contiguous() - allocate pages from contiguous area + * @dev: Pointer to device for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP flags to use for this allocation. + * + * This function allocates memory buffer for specified device. It uses + * device specific contiguous memory area if available or the default + * global one. Requires architecture specific dev_get_cma_area() helper + * function. + */ +struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, + unsigned int align, gfp_t gfp_mask) +{ + if (align > CONFIG_CMA_ALIGNMENT) + align = CONFIG_CMA_ALIGNMENT; + + return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); +} + +/** + * dma_release_from_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by dma_alloc_from_contiguous(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + return cma_release(dev_get_cma_area(dev), pages, count); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_reserved_mem.h> + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + dev_set_cma_area(dev, rmem->priv); + return 0; +} + +static void rmem_cma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev_set_cma_area(dev, NULL); +} + +static const struct reserved_mem_ops rmem_cma_ops = { + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +static int __init rmem_cma_setup(struct reserved_mem *rmem) +{ + phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + phys_addr_t mask = align - 1; + unsigned long node = rmem->fdt_node; + struct cma *cma; + int err; + + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if ((rmem->base & mask) || (rmem->size & mask)) { + pr_err("Reserved memory: incorrect alignment of CMA region\n"); + return -EINVAL; + } + + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + if (err) { + pr_err("Reserved memory: unable to setup CMA region\n"); + return err; + } + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(rmem->base, rmem->size); + + if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + dma_contiguous_set_default(cma); + + rmem->ops = &rmem_cma_ops; + rmem->priv = cma; + + pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + + return 0; +} +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); +#endif diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c new file mode 100644 index 000000000000..c007d25bee09 --- /dev/null +++ b/kernel/dma/debug.c @@ -0,0 +1,1773 @@ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/sched/task_stack.h> +#include <linux/scatterlist.h> +#include <linux/dma-mapping.h> +#include <linux/sched/task.h> +#include <linux/stacktrace.h> +#include <linux/dma-debug.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/export.h> +#include <linux/device.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/list.h> +#include <linux/slab.h> + +#include <asm/sections.h> + +#define HASH_SIZE 1024ULL +#define HASH_FN_SHIFT 13 +#define HASH_FN_MASK (HASH_SIZE - 1) + +/* allow architectures to override this if absolutely required */ +#ifndef PREALLOC_DMA_DEBUG_ENTRIES +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +#endif + +enum { + dma_debug_single, + dma_debug_page, + dma_debug_sg, + dma_debug_coherent, + dma_debug_resource, +}; + +enum map_err_types { + MAP_ERR_CHECK_NOT_APPLICABLE, + MAP_ERR_NOT_CHECKED, + MAP_ERR_CHECKED, +}; + +#define DMA_DEBUG_STACKTRACE_ENTRIES 5 + +/** + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping + * @list: node on pre-allocated free_entries list + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @type: single, page, sg, coherent + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn + * @size: length of the mapping + * @direction: enum dma_data_direction + * @sg_call_ents: 'nents' from dma_map_sg + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @map_err_type: track whether dma_mapping_error() was checked + * @stacktrace: support backtraces when a violation is detected + */ +struct dma_debug_entry { + struct list_head list; + struct device *dev; + int type; + unsigned long pfn; + size_t offset; + u64 dev_addr; + u64 size; + int direction; + int sg_call_ents; + int sg_mapped_ents; + enum map_err_types map_err_type; +#ifdef CONFIG_STACKTRACE + struct stack_trace stacktrace; + unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; +#endif +}; + +typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); + +struct hash_bucket { + struct list_head list; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + +/* Hash list to save the allocated dma addresses */ +static struct hash_bucket dma_entry_hash[HASH_SIZE]; +/* List of pre-allocated dma_debug_entry's */ +static LIST_HEAD(free_entries); +/* Lock for the list above */ +static DEFINE_SPINLOCK(free_entries_lock); + +/* Global disable flag - will be set in case of an error */ +static bool global_disable __read_mostly; + +/* Early initialization disable flag, set at the end of dma_debug_init */ +static bool dma_debug_initialized __read_mostly; + +static inline bool dma_debug_disabled(void) +{ + return global_disable || !dma_debug_initialized; +} + +/* Global error count */ +static u32 error_count; + +/* Global error show enable*/ +static u32 show_all_errors __read_mostly; +/* Number of errors to show */ +static u32 show_num_errors = 1; + +static u32 num_free_entries; +static u32 min_free_entries; +static u32 nr_total_entries; + +/* number of preallocated entries requested by kernel cmdline */ +static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + +/* debugfs dentry's for the stuff above */ +static struct dentry *dma_debug_dent __read_mostly; +static struct dentry *global_disable_dent __read_mostly; +static struct dentry *error_count_dent __read_mostly; +static struct dentry *show_all_errors_dent __read_mostly; +static struct dentry *show_num_errors_dent __read_mostly; +static struct dentry *num_free_entries_dent __read_mostly; +static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *filter_dent __read_mostly; + +/* per-driver filter related state */ + +#define NAME_MAX_LEN 64 + +static char current_driver_name[NAME_MAX_LEN] __read_mostly; +static struct device_driver *current_driver __read_mostly; + +static DEFINE_RWLOCK(driver_name_lock); + +static const char *const maperr2str[] = { + [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable", + [MAP_ERR_NOT_CHECKED] = "dma map error not checked", + [MAP_ERR_CHECKED] = "dma map error checked", +}; + +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; + +static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", + "DMA_FROM_DEVICE", "DMA_NONE" }; + +/* + * The access to some variables in this macro is racy. We can't use atomic_t + * here because all these variables are exported to debugfs. Some of them even + * writeable. This is also the reason why a lock won't help much. But anyway, + * the races are no big deal. Here is why: + * + * error_count: the addition is racy, but the worst thing that can happen is + * that we don't count some errors + * show_num_errors: the subtraction is racy. Also no big deal because in + * worst case this will result in one warning more in the + * system log than the user configured. This variable is + * writeable via debugfs. + */ +static inline void dump_entry_trace(struct dma_debug_entry *entry) +{ +#ifdef CONFIG_STACKTRACE + if (entry) { + pr_warning("Mapped at:\n"); + print_stack_trace(&entry->stacktrace, 0); + } +#endif +} + +static bool driver_filter(struct device *dev) +{ + struct device_driver *drv; + unsigned long flags; + bool ret; + + /* driver filter off */ + if (likely(!current_driver_name[0])) + return true; + + /* driver filter on and initialized */ + if (current_driver && dev && dev->driver == current_driver) + return true; + + /* driver filter on, but we can't filter on a NULL device... */ + if (!dev) + return false; + + if (current_driver || !current_driver_name[0]) + return false; + + /* driver filter on but not yet initialized */ + drv = dev->driver; + if (!drv) + return false; + + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); + + ret = false; + if (drv->name && + strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { + current_driver = drv; + ret = true; + } + + read_unlock_irqrestore(&driver_name_lock, flags); + + return ret; +} + +#define err_printk(dev, entry, format, arg...) do { \ + error_count += 1; \ + if (driver_filter(dev) && \ + (show_all_errors || show_num_errors > 0)) { \ + WARN(1, "%s %s: " format, \ + dev ? dev_driver_string(dev) : "NULL", \ + dev ? dev_name(dev) : "NULL", ## arg); \ + dump_entry_trace(entry); \ + } \ + if (!show_all_errors && show_num_errors > 0) \ + show_num_errors -= 1; \ + } while (0); + +/* + * Hash related functions + * + * Every DMA-API request is saved into a struct dma_debug_entry. To + * have quick access to these structs they are stored into a hash. + */ +static int hash_fn(struct dma_debug_entry *entry) +{ + /* + * Hash function is based on the dma address. + * We use bits 20-27 here as the index into the hash + */ + return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; +} + +/* + * Request exclusive access to a hash bucket for a given dma_debug_entry. + */ +static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, + unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) +{ + int idx = hash_fn(entry); + unsigned long __flags; + + spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); + *flags = __flags; + return &dma_entry_hash[idx]; +} + +/* + * Give up exclusive access to the hash bucket + */ +static void put_hash_bucket(struct hash_bucket *bucket, + unsigned long *flags) + __releases(&bucket->lock) +{ + unsigned long __flags = *flags; + + spin_unlock_irqrestore(&bucket->lock, __flags); +} + +static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) +{ + return ((a->dev_addr == b->dev_addr) && + (a->dev == b->dev)) ? true : false; +} + +static bool containing_match(struct dma_debug_entry *a, + struct dma_debug_entry *b) +{ + if (a->dev != b->dev) + return false; + + if ((b->dev_addr <= a->dev_addr) && + ((b->dev_addr + b->size) >= (a->dev_addr + a->size))) + return true; + + return false; +} + +/* + * Search a given entry in the hash bucket list + */ +static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, + struct dma_debug_entry *ref, + match_fn match) +{ + struct dma_debug_entry *entry, *ret = NULL; + int matches = 0, match_lvl, last_lvl = -1; + + list_for_each_entry(entry, &bucket->list, list) { + if (!match(ref, entry)) + continue; + + /* + * Some drivers map the same physical address multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which returns the entry from + * the hash which fits best to the reference value + * instead of the first-fit. + */ + matches += 1; + match_lvl = 0; + entry->size == ref->size ? ++match_lvl : 0; + entry->type == ref->type ? ++match_lvl : 0; + entry->direction == ref->direction ? ++match_lvl : 0; + entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0; + + if (match_lvl == 4) { + /* perfect-fit - return the result */ + return entry; + } else if (match_lvl > last_lvl) { + /* + * We found an entry that fits better then the + * previous one or it is the 1st match. + */ + last_lvl = match_lvl; + ret = entry; + } + } + + /* + * If we have multiple matches but no perfect-fit, just return + * NULL. + */ + ret = (matches == 1) ? ret : NULL; + + return ret; +} + +static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket, + struct dma_debug_entry *ref) +{ + return __hash_bucket_find(bucket, ref, exact_match); +} + +static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, + struct dma_debug_entry *ref, + unsigned long *flags) +{ + + unsigned int max_range = dma_get_max_seg_size(ref->dev); + struct dma_debug_entry *entry, index = *ref; + unsigned int range = 0; + + while (range <= max_range) { + entry = __hash_bucket_find(*bucket, ref, containing_match); + + if (entry) + return entry; + + /* + * Nothing found, go back a hash bucket + */ + put_hash_bucket(*bucket, flags); + range += (1 << HASH_FN_SHIFT); + index.dev_addr -= (1 << HASH_FN_SHIFT); + *bucket = get_hash_bucket(&index, flags); + } + + return NULL; +} + +/* + * Add an entry to a hash bucket + */ +static void hash_bucket_add(struct hash_bucket *bucket, + struct dma_debug_entry *entry) +{ + list_add_tail(&entry->list, &bucket->list); +} + +/* + * Remove entry from a hash bucket list + */ +static void hash_bucket_del(struct dma_debug_entry *entry) +{ + list_del(&entry->list); +} + +static unsigned long long phys_addr(struct dma_debug_entry *entry) +{ + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; +} + +/* + * Dump mapping entries for debugging purposes + */ +void debug_dma_dump_mappings(struct device *dev) +{ + int idx; + + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!dev || dev == entry->dev) { + dev_info(entry->dev, + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + } + + spin_unlock_irqrestore(&bucket->lock, flags); + } +} + +/* + * For each mapping (initial cacheline in the case of + * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a + * scatterlist, or the cacheline specified in dma_map_single) insert + * into this tree using the cacheline as the key. At + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If + * the entry already exists at insertion time add a tag as a reference + * count for the overlapping mappings. For now, the overlap tracking + * just ensures that 'unmaps' balance 'maps' before marking the + * cacheline idle, but we should also be flagging overlaps as an API + * violation. + * + * Memory usage is mostly constrained by the maximum number of available + * dma-debug entries in that we need a free dma_debug_entry before + * inserting into the tree. In the case of dma_map_page and + * dma_alloc_coherent there is only one dma_debug_entry and one + * dma_active_cacheline entry to track per event. dma_map_sg(), on the + * other hand, consumes a single dma_debug_entry, but inserts 'nents' + * entries into the tree. + * + * At any time debug_dma_assert_idle() can be called to trigger a + * warning if any cachelines in the given page are in the active set. + */ +static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static DEFINE_SPINLOCK(radix_lock); +#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) +#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) +#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT) + +static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) +{ + return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + + (entry->offset >> L1_CACHE_SHIFT); +} + +static int active_cacheline_read_overlap(phys_addr_t cln) +{ + int overlap = 0, i; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (radix_tree_tag_get(&dma_active_cacheline, cln, i)) + overlap |= 1 << i; + return overlap; +} + +static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) +{ + int i; + + if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0) + return overlap; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (overlap & 1 << i) + radix_tree_tag_set(&dma_active_cacheline, cln, i); + else + radix_tree_tag_clear(&dma_active_cacheline, cln, i); + + return overlap; +} + +static void active_cacheline_inc_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + overlap = active_cacheline_set_overlap(cln, ++overlap); + + /* If we overflowed the overlap counter then we're potentially + * leaking dma-mappings. Otherwise, if maps and unmaps are + * balanced then this overflow may cause false negatives in + * debug_dma_assert_idle() as the cacheline may be marked idle + * prematurely. + */ + WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", + ACTIVE_CACHELINE_MAX_OVERLAP, &cln); +} + +static int active_cacheline_dec_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + return active_cacheline_set_overlap(cln, --overlap); +} + +static int active_cacheline_insert(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + int rc; + + /* If the device is not writing memory then we don't have any + * concerns about the cpu consuming stale data. This mitigates + * legitimate usages of overlapping mappings. + */ + if (entry->direction == DMA_TO_DEVICE) + return 0; + + spin_lock_irqsave(&radix_lock, flags); + rc = radix_tree_insert(&dma_active_cacheline, cln, entry); + if (rc == -EEXIST) + active_cacheline_inc_overlap(cln); + spin_unlock_irqrestore(&radix_lock, flags); + + return rc; +} + +static void active_cacheline_remove(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + + /* ...mirror the insert case */ + if (entry->direction == DMA_TO_DEVICE) + return; + + spin_lock_irqsave(&radix_lock, flags); + /* since we are counting overlaps the final put of the + * cacheline will occur when the overlap count is 0. + * active_cacheline_dec_overlap() returns -1 in that case + */ + if (active_cacheline_dec_overlap(cln) < 0) + radix_tree_delete(&dma_active_cacheline, cln); + spin_unlock_irqrestore(&radix_lock, flags); +} + +/** + * debug_dma_assert_idle() - assert that a page is not undergoing dma + * @page: page to lookup in the dma_active_cacheline tree + * + * Place a call to this routine in cases where the cpu touching the page + * before the dma completes (page is dma_unmapped) will lead to data + * corruption. + */ +void debug_dma_assert_idle(struct page *page) +{ + static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; + struct dma_debug_entry *entry = NULL; + void **results = (void **) &ents; + unsigned int nents, i; + unsigned long flags; + phys_addr_t cln; + + if (dma_debug_disabled()) + return; + + if (!page) + return; + + cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; + spin_lock_irqsave(&radix_lock, flags); + nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, + CACHELINES_PER_PAGE); + for (i = 0; i < nents; i++) { + phys_addr_t ent_cln = to_cacheline_number(ents[i]); + + if (ent_cln == cln) { + entry = ents[i]; + break; + } else if (ent_cln >= cln + CACHELINES_PER_PAGE) + break; + } + spin_unlock_irqrestore(&radix_lock, flags); + + if (!entry) + return; + + cln = to_cacheline_number(entry); + err_printk(entry->dev, entry, + "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", + &cln); +} + +/* + * Wrapper function for adding an entry to the hash. + * This function takes care of locking itself. + */ +static void add_dma_entry(struct dma_debug_entry *entry) +{ + struct hash_bucket *bucket; + unsigned long flags; + int rc; + + bucket = get_hash_bucket(entry, &flags); + hash_bucket_add(bucket, entry); + put_hash_bucket(bucket, &flags); + + rc = active_cacheline_insert(entry); + if (rc == -ENOMEM) { + pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); + global_disable = true; + } + + /* TODO: report -EEXIST errors here as overlapping mappings are + * not supported by the DMA API + */ +} + +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + +/* struct dma_entry allocator + * + * The next two functions implement the allocator for + * struct dma_debug_entries. + */ +static struct dma_debug_entry *dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&free_entries_lock, flags); + + if (list_empty(&free_entries)) { + global_disable = true; + spin_unlock_irqrestore(&free_entries_lock, flags); + pr_err("DMA-API: debugging out of memory - disabling\n"); + return NULL; + } + + entry = __dma_entry_alloc(); + + spin_unlock_irqrestore(&free_entries_lock, flags); + +#ifdef CONFIG_STACKTRACE + entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; + entry->stacktrace.entries = entry->st_entries; + entry->stacktrace.skip = 2; + save_stack_trace(&entry->stacktrace); +#endif + + return entry; +} + +static void dma_entry_free(struct dma_debug_entry *entry) +{ + unsigned long flags; + + active_cacheline_remove(entry); + + /* + * add to beginning of the list - this way the entries are + * more likely cache hot when they are reallocated. + */ + spin_lock_irqsave(&free_entries_lock, flags); + list_add(&entry->list, &free_entries); + num_free_entries += 1; + spin_unlock_irqrestore(&free_entries_lock, flags); +} + +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} + +/* + * DMA-API debugging init code + * + * The init code does two things: + * 1. Initialize core data structures + * 2. Preallocate a given number of dma_debug_entry structs + */ + +static int prealloc_memory(u32 num_entries) +{ + struct dma_debug_entry *entry, *next_entry; + int i; + + for (i = 0; i < num_entries; ++i) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_err; + + list_add_tail(&entry->list, &free_entries); + } + + num_free_entries = num_entries; + min_free_entries = num_entries; + + pr_info("DMA-API: preallocated %d debug entries\n", num_entries); + + return 0; + +out_err: + + list_for_each_entry_safe(entry, next_entry, &free_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + return -ENOMEM; +} + +static ssize_t filter_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN + 1]; + unsigned long flags; + int len; + + if (!current_driver_name[0]) + return 0; + + /* + * We can't copy to userspace directly because current_driver_name can + * only be read under the driver_name_lock with irqs disabled. So + * create a temporary copy first. + */ + read_lock_irqsave(&driver_name_lock, flags); + len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); + read_unlock_irqrestore(&driver_name_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t filter_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN]; + unsigned long flags; + size_t len; + int i; + + /* + * We can't copy from userspace directly. Access to + * current_driver_name is protected with a write_lock with irqs + * disabled. Since copy_from_user can fault and may sleep we + * need to copy to temporary buffer first + */ + len = min(count, (size_t)(NAME_MAX_LEN - 1)); + if (copy_from_user(buf, userbuf, len)) + return -EFAULT; + + buf[len] = 0; + + write_lock_irqsave(&driver_name_lock, flags); + + /* + * Now handle the string we got from userspace very carefully. + * The rules are: + * - only use the first token we got + * - token delimiter is everything looking like a space + * character (' ', '\n', '\t' ...) + * + */ + if (!isalnum(buf[0])) { + /* + * If the first character userspace gave us is not + * alphanumerical then assume the filter should be + * switched off. + */ + if (current_driver_name[0]) + pr_info("DMA-API: switching off dma-debug driver filter\n"); + current_driver_name[0] = 0; + current_driver = NULL; + goto out_unlock; + } + + /* + * Now parse out the first token and use it as the name for the + * driver to filter for. + */ + for (i = 0; i < NAME_MAX_LEN - 1; ++i) { + current_driver_name[i] = buf[i]; + if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) + break; + } + current_driver_name[i] = 0; + current_driver = NULL; + + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + +out_unlock: + write_unlock_irqrestore(&driver_name_lock, flags); + + return count; +} + +static const struct file_operations filter_fops = { + .read = filter_read, + .write = filter_write, + .llseek = default_llseek, +}; + +static int dma_debug_fs_init(void) +{ + dma_debug_dent = debugfs_create_dir("dma-api", NULL); + if (!dma_debug_dent) { + pr_err("DMA-API: can not create debugfs directory\n"); + return -ENOMEM; + } + + global_disable_dent = debugfs_create_bool("disabled", 0444, + dma_debug_dent, + &global_disable); + if (!global_disable_dent) + goto out_err; + + error_count_dent = debugfs_create_u32("error_count", 0444, + dma_debug_dent, &error_count); + if (!error_count_dent) + goto out_err; + + show_all_errors_dent = debugfs_create_u32("all_errors", 0644, + dma_debug_dent, + &show_all_errors); + if (!show_all_errors_dent) + goto out_err; + + show_num_errors_dent = debugfs_create_u32("num_errors", 0644, + dma_debug_dent, + &show_num_errors); + if (!show_num_errors_dent) + goto out_err; + + num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, + dma_debug_dent, + &num_free_entries); + if (!num_free_entries_dent) + goto out_err; + + min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, + dma_debug_dent, + &min_free_entries); + if (!min_free_entries_dent) + goto out_err; + + filter_dent = debugfs_create_file("driver_filter", 0644, + dma_debug_dent, NULL, &filter_fops); + if (!filter_dent) + goto out_err; + + return 0; + +out_err: + debugfs_remove_recursive(dma_debug_dent); + + return -ENOMEM; +} + +static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) { + count += 1; + *out_entry = entry; + } + } + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + } + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) +{ + struct device *dev = data; + struct dma_debug_entry *uninitialized_var(entry); + int count; + + if (dma_debug_disabled()) + return 0; + + switch (action) { + case BUS_NOTIFY_UNBOUND_DRIVER: + count = device_dma_allocations(dev, &entry); + if (count == 0) + break; + err_printk(dev, entry, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n" + "One of leaked entries details: " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [mapped as %s]\n", + count, entry->dev_addr, entry->size, + dir2name[entry->direction], type2name[entry->type]); + break; + default: + break; + } + + return 0; +} + +void dma_debug_add_bus(struct bus_type *bus) +{ + struct notifier_block *nb; + + if (dma_debug_disabled()) + return; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + pr_err("dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + bus_register_notifier(bus, nb); +} + +static int dma_debug_init(void) +{ + int i; + + /* Do not use dma_debug_initialized here, since we really want to be + * called to set dma_debug_initialized + */ + if (global_disable) + return 0; + + for (i = 0; i < HASH_SIZE; ++i) { + INIT_LIST_HEAD(&dma_entry_hash[i].list); + spin_lock_init(&dma_entry_hash[i].lock); + } + + if (dma_debug_fs_init() != 0) { + pr_err("DMA-API: error creating debugfs entries - disabling\n"); + global_disable = true; + + return 0; + } + + if (prealloc_memory(nr_prealloc_entries) != 0) { + pr_err("DMA-API: debugging out of memory error - disabled\n"); + global_disable = true; + + return 0; + } + + nr_total_entries = num_free_entries; + + dma_debug_initialized = true; + + pr_info("DMA-API: debugging enabled by kernel config\n"); + return 0; +} +core_initcall(dma_debug_init); + +static __init int dma_debug_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (strncmp(str, "off", 3) == 0) { + pr_info("DMA-API: debugging disabled on kernel command line\n"); + global_disable = true; + } + + return 0; +} + +static __init int dma_debug_entries_cmdline(char *str) +{ + if (!str) + return -EINVAL; + if (!get_option(&str, &nr_prealloc_entries)) + nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + return 0; +} + +__setup("dma_debug=", dma_debug_cmdline); +__setup("dma_debug_entries=", dma_debug_entries_cmdline); + +static void check_unmap(struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + + if (!entry) { + /* must drop lock before calling dma_mapping_error */ + put_hash_bucket(bucket, &flags); + + if (dma_mapping_error(ref->dev, ref->dev_addr)) { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free an " + "invalid DMA memory address\n"); + } else { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free DMA " + "memory it has not allocated [device " + "address=0x%016llx] [size=%llu bytes]\n", + ref->dev_addr, ref->size); + } + return; + } + + if (ref->size != entry->size) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different size " + "[device address=0x%016llx] [map size=%llu bytes] " + "[unmap size=%llu bytes]\n", + ref->dev_addr, entry->size, ref->size); + } + + if (ref->type != entry->type) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with wrong function " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s] [unmapped as %s]\n", + ref->dev_addr, ref->size, + type2name[entry->type], type2name[ref->type]); + } else if ((entry->type == dma_debug_coherent) && + (phys_addr(ref) != phys_addr(entry))) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different CPU address " + "[device address=0x%016llx] [size=%llu bytes] " + "[cpu alloc address=0x%016llx] " + "[cpu free address=0x%016llx]", + ref->dev_addr, ref->size, + phys_addr(entry), + phys_addr(ref)); + } + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA sg list with different entry count " + "[map count=%d] [unmap count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + + /* + * This may be no bug in reality - but most implementations of the + * DMA API don't handle this properly, so check for it here + */ + if (ref->direction != entry->direction) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [unmapped with %s]\n", + ref->dev_addr, ref->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). + * If not, print this warning message. See Documentation/DMA-API.txt. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + err_printk(ref->dev, entry, + "DMA-API: device driver failed to check map error" + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s]", + ref->dev_addr, ref->size, + type2name[entry->type]); + } + + hash_bucket_del(entry); + dma_entry_free(entry); + + put_hash_bucket(bucket, &flags); +} + +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) +{ + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } +} + +static inline bool overlap(void *addr, unsigned long len, void *start, void *end) +{ + unsigned long a1 = (unsigned long)addr; + unsigned long b1 = a1 + len; + unsigned long a2 = (unsigned long)start; + unsigned long b2 = (unsigned long)end; + + return !(b1 <= a2 || a1 >= b2); +} + +static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) +{ + if (overlap(addr, len, _stext, _etext) || + overlap(addr, len, __start_rodata, __end_rodata)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); +} + +static void check_sync(struct device *dev, + struct dma_debug_entry *ref, + bool to_cpu) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, &flags); + + entry = bucket_find_contain(&bucket, ref, &flags); + + if (!entry) { + err_printk(dev, NULL, "DMA-API: device driver tries " + "to sync DMA memory it has not allocated " + "[device address=0x%016llx] [size=%llu bytes]\n", + (unsigned long long)ref->dev_addr, ref->size); + goto out; + } + + if (ref->size > entry->size) { + err_printk(dev, entry, "DMA-API: device driver syncs" + " DMA memory outside allocated range " + "[device address=0x%016llx] " + "[allocation size=%llu bytes] " + "[sync offset+size=%llu]\n", + entry->dev_addr, entry->size, + ref->size); + } + + if (entry->direction == DMA_BIDIRECTIONAL) + goto out; + + if (ref->direction != entry->direction) { + err_printk(dev, entry, "DMA-API: device driver syncs " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && + !(ref->direction == DMA_TO_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device read-only DMA memory for cpu " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && + !(ref->direction == DMA_FROM_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device write-only DMA memory to device " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver syncs " + "DMA sg list with different entry count " + "[map count=%d] [sync count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + +out: + put_hash_bucket(bucket, &flags); +} + +static void check_sg_segment(struct device *dev, struct scatterlist *sg) +{ +#ifdef CONFIG_DMA_API_DEBUG_SG + unsigned int max_seg = dma_get_max_seg_size(dev); + u64 start, end, boundary = dma_get_seg_boundary(dev); + + /* + * Either the driver forgot to set dma_parms appropriately, or + * whoever generated the list forgot to check them. + */ + if (sg->length > max_seg) + err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", + sg->length, max_seg); + /* + * In some cases this could potentially be the DMA API + * implementation's fault, but it would usually imply that + * the scatterlist was built inappropriately to begin with. + */ + start = sg_dma_address(sg); + end = start + sg_dma_len(sg) - 1; + if ((start ^ end) & ~boundary) + err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", + start, end, boundary); +#endif +} + +void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, + size_t size, int direction, dma_addr_t dma_addr, + bool map_single) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (dma_mapping_error(dev, dma_addr)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->dev = dev; + entry->type = dma_debug_page; + entry->pfn = page_to_pfn(page); + entry->offset = offset, + entry->dev_addr = dma_addr; + entry->size = size; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + if (map_single) + entry->type = dma_debug_single; + + check_for_stack(dev, page, offset); + + if (!PageHighMem(page)) { + void *addr = page_address(page) + offset; + + check_for_illegal_area(dev, addr, size); + } + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_page); + +void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + struct dma_debug_entry ref; + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + if (unlikely(dma_debug_disabled())) + return; + + ref.dev = dev; + ref.dev_addr = dma_addr; + bucket = get_hash_bucket(&ref, &flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!exact_match(&ref, entry)) + continue; + + /* + * The same physical address can be mapped multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which updates the first entry + * from the hash which fits the reference value and is + * not currently listed as being checked. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + entry->map_err_type = MAP_ERR_CHECKED; + break; + } + } + + put_hash_bucket(bucket, &flags); +} +EXPORT_SYMBOL(debug_dma_mapping_error); + +void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, bool map_single) +{ + struct dma_debug_entry ref = { + .type = dma_debug_page, + .dev = dev, + .dev_addr = addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + if (map_single) + ref.type = dma_debug_single; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_page); + +void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ + struct dma_debug_entry *entry; + struct scatterlist *s; + int i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, mapped_ents, i) { + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_sg; + entry->dev = dev; + entry->pfn = page_to_pfn(sg_page(s)); + entry->offset = s->offset, + entry->size = sg_dma_len(s); + entry->dev_addr = sg_dma_address(s); + entry->direction = direction; + entry->sg_call_ents = nents; + entry->sg_mapped_ents = mapped_ents; + + check_for_stack(dev, sg_page(s), s->offset); + + if (!PageHighMem(sg_page(s))) { + check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); + } + + check_sg_segment(dev, s); + + add_dma_entry(entry); + } +} +EXPORT_SYMBOL(debug_dma_map_sg); + +static int get_nr_mapped_entries(struct device *dev, + struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + int mapped_ents; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + mapped_ents = 0; + + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + + return mapped_ents; +} + +void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sglist, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = dir, + .sg_call_ents = nelems, + }; + + if (mapped_ents && i >= mapped_ents) + break; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + check_unmap(&ref); + } +} +EXPORT_SYMBOL(debug_dma_unmap_sg); + +void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (unlikely(virt == NULL)) + return; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_coherent; + entry->dev = dev; + entry->offset = offset_in_page(virt); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = DMA_BIDIRECTIONAL; + + if (is_vmalloc_addr(virt)) + entry->pfn = vmalloc_to_pfn(virt); + else + entry->pfn = page_to_pfn(virt_to_page(virt)); + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_alloc_coherent); + +void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_coherent, + .dev = dev, + .offset = offset_in_page(virt), + .dev_addr = addr, + .size = size, + .direction = DMA_BIDIRECTIONAL, + }; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + if (is_vmalloc_addr(virt)) + ref.pfn = vmalloc_to_pfn(virt); + else + ref.pfn = page_to_pfn(virt_to_page(virt)); + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_free_coherent); + +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + +void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); + +void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_device); + +void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); + +void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); + +void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, true); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); + +void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, false); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_device); + +static int __init dma_debug_driver_setup(char *str) +{ + int i; + + for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { + current_driver_name[i] = *str; + if (*str == 0) + break; + } + + if (current_driver_name[0]) + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + + + return 1; +} +__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c new file mode 100644 index 000000000000..8be8106270c2 --- /dev/null +++ b/kernel/dma/direct.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map physical memory directly without using an IOMMU or + * flushing caches. + */ +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/scatterlist.h> +#include <linux/dma-contiguous.h> +#include <linux/pfn.h> +#include <linux/set_memory.h> + +#define DIRECT_MAPPING_ERROR 0 + +/* + * Most architectures use ZONE_DMA for the first 16 Megabytes, but + * some use it for entirely different regions: + */ +#ifndef ARCH_ZONE_DMA_BITS +#define ARCH_ZONE_DMA_BITS 24 +#endif + +/* + * For AMD SEV all DMA must be to unencrypted addresses. + */ +static inline bool force_dma_unencrypted(void) +{ + return sev_active(); +} + +static bool +check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, + const char *caller) +{ + if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + if (!dev->dma_mask) { + dev_err(dev, + "%s: call on device without dma_mask\n", + caller); + return false; + } + + if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + dev_err(dev, + "%s: overflow %pad+%zu of device mask %llx\n", + caller, &dma_addr, size, *dev->dma_mask); + } + return false; + } + return true; +} + +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +{ + dma_addr_t addr = force_dma_unencrypted() ? + __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); + return addr + size - 1 <= dev->coherent_dma_mask; +} + +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + int page_order = get_order(size); + struct page *page = NULL; + void *ret; + + /* we always manually zero the memory once we are done: */ + gfp &= ~__GFP_ZERO; + + /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ + if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + gfp |= GFP_DMA; + if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) + gfp |= GFP_DMA32; + +again: + /* CMA can be used only in the context which permits sleeping */ + if (gfpflags_allow_blocking(gfp)) { + page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, page_order); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __free_pages(page, page_order); + page = NULL; + + if (IS_ENABLED(CONFIG_ZONE_DMA32) && + dev->coherent_dma_mask < DMA_BIT_MASK(64) && + !(gfp & (GFP_DMA32 | GFP_DMA))) { + gfp |= GFP_DMA32; + goto again; + } + + if (IS_ENABLED(CONFIG_ZONE_DMA) && + dev->coherent_dma_mask < DMA_BIT_MASK(32) && + !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + } + + if (!page) + return NULL; + ret = page_address(page); + if (force_dma_unencrypted()) { + set_memory_decrypted((unsigned long)ret, 1 << page_order); + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + } else { + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + } + memset(ret, 0, size); + return ret; +} + +/* + * NOTE: this function must never look at the dma_addr argument, because we want + * to be able to use it as a helper for iommu implementations as well. + */ +void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned int page_order = get_order(size); + + if (force_dma_unencrypted()) + set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) + free_pages((unsigned long)cpu_addr, page_order); +} + +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; + + if (!check_addr(dev, dma_addr, size, __func__)) + return DIRECT_MAPPING_ERROR; + return dma_addr; +} + +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + + sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); + if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) + return 0; + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +int dma_direct_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_ZONE_DMA + if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return 0; +#else + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask < DMA_BIT_MASK(32)) + return 0; +#endif + /* + * Various PCI/PCIe bridges have broken support for > 32bit DMA even + * if the device itself might support it. + */ + if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + return 0; + return 1; +} + +int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == DIRECT_MAPPING_ERROR; +} + +const struct dma_map_ops dma_direct_ops = { + .alloc = dma_direct_alloc, + .free = dma_direct_free, + .map_page = dma_direct_map_page, + .map_sg = dma_direct_map_sg, + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, +}; +EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c new file mode 100644 index 000000000000..d2a92ddaac4d --- /dev/null +++ b/kernel/dma/mapping.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch-independent dma-mapping routines + * + * Copyright (c) 2006 SUSE Linux Products GmbH + * Copyright (c) 2006 Tejun Heo <teheo@suse.de> + */ + +#include <linux/acpi.h> +#include <linux/dma-mapping.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/of_device.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/* + * Managed DMA API + */ +struct dma_devres { + size_t size; + void *vaddr; + dma_addr_t dma_handle; + unsigned long attrs; +}; + +static void dmam_release(struct device *dev, void *res) +{ + struct dma_devres *this = res; + + dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, + this->attrs); +} + +static int dmam_match(struct device *dev, void *res, void *match_data) +{ + struct dma_devres *this = res, *match = match_data; + + if (this->vaddr == match->vaddr) { + WARN_ON(this->size != match->size || + this->dma_handle != match->dma_handle); + return 1; + } + return 0; +} + +/** + * dmam_alloc_coherent - Managed dma_alloc_coherent() + * @dev: Device to allocate coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * + * Managed dma_alloc_coherent(). Memory allocated using this function + * will be automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. + */ +void *dmam_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_coherent); + +/** + * dmam_free_coherent - Managed dma_free_coherent() + * @dev: Device to free coherent memory for + * @size: Size of allocation + * @vaddr: Virtual address of the memory to free + * @dma_handle: DMA handle of the memory to free + * + * Managed dma_free_coherent(). + */ +void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle) +{ + struct dma_devres match_data = { size, vaddr, dma_handle }; + + dma_free_coherent(dev, size, vaddr, dma_handle); + WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); +} +EXPORT_SYMBOL(dmam_free_coherent); + +/** + * dmam_alloc_attrs - Managed dma_alloc_attrs() + * @dev: Device to allocate non_coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * @attrs: Flags in the DMA_ATTR_* namespace. + * + * Managed dma_alloc_attrs(). Memory allocated using this function will be + * automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. + */ +void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + dr->attrs = attrs; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_attrs); + +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT + +static void dmam_coherent_decl_release(struct device *dev, void *res) +{ + dma_release_declared_memory(dev); +} + +/** + * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() + * @dev: Device to declare coherent memory for + * @phys_addr: Physical address of coherent memory to be declared + * @device_addr: Device address of coherent memory to be declared + * @size: Size of coherent memory to be declared + * @flags: Flags + * + * Managed dma_declare_coherent_memory(). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void *res; + int rc; + + res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); + if (!res) + return -ENOMEM; + + rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, + flags); + if (!rc) + devres_add(dev, res); + else + devres_free(res); + + return rc; +} +EXPORT_SYMBOL(dmam_declare_coherent_memory); + +/** + * dmam_release_declared_memory - Managed dma_release_declared_memory(). + * @dev: Device to release declared coherent memory for + * + * Managed dmam_release_declared_memory(). + */ +void dmam_release_declared_memory(struct device *dev) +{ + WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); +} +EXPORT_SYMBOL(dmam_release_declared_memory); + +#endif + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t handle, size_t size) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return 0; +} +EXPORT_SYMBOL(dma_common_get_sgtable); + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + int ret = -ENXIO; +#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off < count && user_count <= (count - off)) + ret = remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + off, + user_count << PAGE_SHIFT, + vma->vm_page_prot); +#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ + + return ret; +} +EXPORT_SYMBOL(dma_common_mmap); + +#ifdef CONFIG_MMU +static struct vm_struct *__dma_common_pages_remap(struct page **pages, + size_t size, unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, vm_flags, caller); + if (!area) + return NULL; + + if (map_vm_area(area, prot, pages)) { + vunmap(area->addr); + return NULL; + } + + return area; +} + +/* + * remaps an array of PAGE_SIZE pages into another vm_area + * Cannot be used in non-sleeping contexts + */ +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + if (!area) + return NULL; + + area->pages = pages; + + return area->addr; +} + +/* + * remaps an allocated contiguous region into another vm_area. + * Cannot be used in non-sleeping contexts + */ + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller) +{ + int i; + struct page **pages; + struct vm_struct *area; + + pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + if (!pages) + return NULL; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = nth_page(page, i); + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + + kfree(pages); + + if (!area) + return NULL; + return area->addr; +} + +/* + * unmaps a range previously mapped by dma_common_*_remap + */ +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || (area->flags & vm_flags) != vm_flags) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; + } + + unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); + vunmap(cpu_addr); +} +#endif + +/* + * enables DMA API use for a device + */ +int dma_configure(struct device *dev) +{ + if (dev->bus->dma_configure) + return dev->bus->dma_configure(dev); + return 0; +} + +void dma_deconfigure(struct device *dev) +{ + of_dma_deconfigure(dev); + acpi_dma_deconfigure(dev); +} diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c new file mode 100644 index 000000000000..79e9a757387f --- /dev/null +++ b/kernel/dma/noncoherent.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without providing cache + * coherence. + */ +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/scatterlist.h> + +static void dma_noncoherent_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); +} + +static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t addr; + + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(dev, page_to_phys(page) + offset, + size, dir); + return addr; +} + +static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); + if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); + return nents; +} + +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +static void dma_noncoherent_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); +} + +static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); +} + +static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); +} +#endif + +const struct dma_map_ops dma_noncoherent_ops = { + .alloc = arch_dma_alloc, + .free = arch_dma_free, + .mmap = arch_dma_mmap, + .sync_single_for_device = dma_noncoherent_sync_single_for_device, + .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, + .map_page = dma_noncoherent_map_page, + .map_sg = dma_noncoherent_map_sg, +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU + .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, + .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, + .unmap_page = dma_noncoherent_unmap_page, + .unmap_sg = dma_noncoherent_unmap_sg, +#endif + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, + .cache_sync = arch_dma_cache_sync, +}; +EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c new file mode 100644 index 000000000000..04b68d9dffac --- /dev/null +++ b/kernel/dma/swiotlb.c @@ -0,0 +1,1087 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is a fallback for platforms that do not support + * I/O TLBs (aka DMA address translation hardware). + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@intel.com> + * Copyright (C) 2000 Goutham Rao <goutham.rao@intel.com> + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@hpl.hp.com> + * + * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. + * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid + * unnecessary i-cache flushing. + * 04/07/.. ak Better overflow handling. Assorted fixes. + * 05/09/10 linville Add support for syncing ranges, support syncing for + * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. + * 08/12/11 beckyb Add highmem support + */ + +#include <linux/cache.h> +#include <linux/dma-direct.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/swiotlb.h> +#include <linux/pfn.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/scatterlist.h> +#include <linux/mem_encrypt.h> +#include <linux/set_memory.h> + +#include <asm/io.h> +#include <asm/dma.h> + +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/iommu-helper.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/swiotlb.h> + +#define OFFSET(val,align) ((unsigned long) \ + ( (val) & ( (align) - 1))) + +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) + +/* + * Minimum IO TLB size to bother booting with. Systems with mainly + * 64bit capable cards will only lightly use the swiotlb. If we can't + * allocate a contiguous 1MB, we're probably in trouble anyway. + */ +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + +enum swiotlb_force swiotlb_force; + +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static phys_addr_t io_tlb_start, io_tlb_end; + +/* + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +static phys_addr_t io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * Max segment that we can provide which (if pages are contingous) will + * not be bounced (unless SWIOTLB_FORCE is set). + */ +unsigned int max_segment; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations. + */ +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) +static phys_addr_t *io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static int late_alloc; + +static int __init +setup_io_tlb_npages(char *str) +{ + if (isdigit(*str)) { + io_tlb_nslabs = simple_strtoul(str, &str, 0); + /* avoid tail segment of size < IO_TLB_SEGSIZE */ + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + if (*str == ',') + ++str; + if (!strcmp(str, "force")) { + swiotlb_force = SWIOTLB_FORCE; + } else if (!strcmp(str, "noforce")) { + swiotlb_force = SWIOTLB_NO_FORCE; + io_tlb_nslabs = 1; + } + + return 0; +} +early_param("swiotlb", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? */ + +unsigned long swiotlb_nr_tbl(void) +{ + return io_tlb_nslabs; +} +EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +unsigned int swiotlb_max_segment(void) +{ + return max_segment; +} +EXPORT_SYMBOL_GPL(swiotlb_max_segment); + +void swiotlb_set_max_segment(unsigned int val) +{ + if (swiotlb_force == SWIOTLB_FORCE) + max_segment = 1; + else + max_segment = rounddown(val, PAGE_SIZE); +} + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? size : (IO_TLB_DEFAULT_SIZE); +} + +static bool no_iotlb_memory; + +void swiotlb_print_info(void) +{ + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + unsigned char *vstart, *vend; + + if (no_iotlb_memory) { + pr_warn("software IO TLB: No low mem\n"); + return; + } + + vstart = phys_to_virt(io_tlb_start); + vend = phys_to_virt(io_tlb_end); + + printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + (unsigned long long)io_tlb_start, + (unsigned long long)io_tlb_end, + bytes >> 20, vstart, vend - 1); +} + +/* + * Early SWIOTLB allocation may be too early to allow an architecture to + * perform the desired operations. This function allows the architecture to + * call SWIOTLB when the operations are possible. It needs to be called + * before the SWIOTLB memory is used. + */ +void __init swiotlb_update_mem_attributes(void) +{ + void *vaddr; + unsigned long bytes; + + if (no_iotlb_memory || late_alloc) + return; + + vaddr = phys_to_virt(io_tlb_start); + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); + + vaddr = phys_to_virt(io_tlb_overflow_buffer); + bytes = PAGE_ALIGN(io_tlb_overflow); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); +} + +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +{ + void *v_overflow_buffer; + unsigned long i, bytes; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = __pa(tlb); + io_tlb_end = io_tlb_start + bytes; + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = memblock_virt_alloc_low_nopanic( + PAGE_ALIGN(io_tlb_overflow), + PAGE_SIZE); + if (!v_overflow_buffer) + return -ENOMEM; + + io_tlb_overflow_buffer = __pa(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), + PAGE_SIZE); + io_tlb_orig_addr = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), + PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + if (verbose) + swiotlb_print_info(); + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + return 0; +} + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init +swiotlb_init(int verbose) +{ + size_t default_size = IO_TLB_DEFAULT_SIZE; + unsigned char *vstart; + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* Get IO TLB memory from the low pages */ + vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + return; + + if (io_tlb_start) + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + pr_warn("Cannot allocate SWIOTLB buffer"); + no_iotlb_memory = true; +} + +/* + * Systems with larger DMA zones (those that don't support ISA) can + * initialize the swiotlb later using the slab allocator if needed. + * This should be just like above, but with some error catching. + */ +int +swiotlb_late_init_with_default_size(size_t default_size) +{ + unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned char *vstart = NULL; + unsigned int order; + int rc = 0; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + /* + * Get IO TLB memory from the low pages + */ + order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); + io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (vstart) + break; + order--; + } + + if (!vstart) { + io_tlb_nslabs = req_nslabs; + return -ENOMEM; + } + if (order != get_order(bytes)) { + printk(KERN_WARNING "Warning: only able to allocate %ld MB " + "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + io_tlb_nslabs = SLABS_PER_PAGE << order; + } + rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + if (rc) + free_pages((unsigned long)vstart, order); + + return rc; +} + +int +swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) +{ + unsigned long i, bytes; + unsigned char *v_overflow_buffer; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = virt_to_phys(tlb); + io_tlb_end = io_tlb_start + bytes; + + set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); + memset(tlb, 0, bytes); + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!v_overflow_buffer) + goto cleanup2; + + set_memory_decrypted((unsigned long)v_overflow_buffer, + io_tlb_overflow >> PAGE_SHIFT); + memset(v_overflow_buffer, 0, io_tlb_overflow); + io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * sizeof(int))); + if (!io_tlb_list) + goto cleanup3; + + io_tlb_orig_addr = (phys_addr_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + if (!io_tlb_orig_addr) + goto cleanup4; + + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + swiotlb_print_info(); + + late_alloc = 1; + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + + return 0; + +cleanup4: + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + io_tlb_list = NULL; +cleanup3: + free_pages((unsigned long)v_overflow_buffer, + get_order(io_tlb_overflow)); + io_tlb_overflow_buffer = 0; +cleanup2: + io_tlb_end = 0; + io_tlb_start = 0; + io_tlb_nslabs = 0; + max_segment = 0; + return -ENOMEM; +} + +void __init swiotlb_exit(void) +{ + if (!io_tlb_orig_addr) + return; + + if (late_alloc) { + free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), + get_order(io_tlb_overflow)); + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + free_pages((unsigned long)phys_to_virt(io_tlb_start), + get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + } else { + memblock_free_late(io_tlb_overflow_buffer, + PAGE_ALIGN(io_tlb_overflow)); + memblock_free_late(__pa(io_tlb_orig_addr), + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_list), + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + memblock_free_late(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + } + io_tlb_nslabs = 0; + max_segment = 0; +} + +int is_swiotlb_buffer(phys_addr_t paddr) +{ + return paddr >= io_tlb_start && paddr < io_tlb_end; +} + +/* + * Bounce: copy the swiotlb buffer back to the original dma location + */ +static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + unsigned long pfn = PFN_DOWN(orig_addr); + unsigned char *vaddr = phys_to_virt(tlb_addr); + + if (PageHighMem(pfn_to_page(pfn))) { + /* The buffer does not have a mapping. Map it in and copy */ + unsigned int offset = orig_addr & ~PAGE_MASK; + char *buffer; + unsigned int sz = 0; + unsigned long flags; + + while (size) { + sz = min_t(size_t, PAGE_SIZE - offset, size); + + local_irq_save(flags); + buffer = kmap_atomic(pfn_to_page(pfn)); + if (dir == DMA_TO_DEVICE) + memcpy(vaddr, buffer + offset, sz); + else + memcpy(buffer + offset, vaddr, sz); + kunmap_atomic(buffer); + local_irq_restore(flags); + + size -= sz; + pfn++; + vaddr += sz; + offset = 0; + } + } else if (dir == DMA_TO_DEVICE) { + memcpy(vaddr, phys_to_virt(orig_addr), size); + } else { + memcpy(phys_to_virt(orig_addr), vaddr, size); + } +} + +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, + dma_addr_t tbl_dma_addr, + phys_addr_t orig_addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + phys_addr_t tlb_addr; + unsigned int nslots, stride, index, wrap; + int i; + unsigned long mask; + unsigned long offset_slots; + unsigned long max_slots; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("%s is active and system is using DMA bounce buffers\n", + sme_active() ? "SME" : "SEV"); + + mask = dma_get_seg_boundary(hwdev); + + tbl_dma_addr &= mask; + + offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + + /* + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ + max_slots = mask + 1 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + + /* + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size >= PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + index = ALIGN(io_tlb_index, stride); + if (index >= io_tlb_nslabs) + index = 0; + wrap = index; + + do { + while (iommu_is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + if (index == wrap) + goto not_found; + } + + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int) (index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in the next + * round. + */ + io_tlb_index = ((index + nslots) < io_tlb_nslabs + ? (index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + return SWIOTLB_MAP_ERROR; +found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. + * This is needed when we sync the memory. Then we sync the buffer if + * needed. + */ + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + + return tlb_addr; +} + +/* + * Allocates bounce buffer and returns its physical address. + */ +static phys_addr_t +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t start_dma_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) { + dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", + &phys); + return SWIOTLB_MAP_ERROR; + } + + start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, + dir, attrs); +} + +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + /* + * First, sync the memory before unmapping the entry + */ + if (orig_addr != INVALID_PHYS_ADDR && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contiguous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? + io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + if (orig_addr == INVALID_PHYS_ADDR) + return; + orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + + switch (target) { + case SYNC_FOR_CPU: + if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); + break; + case SYNC_FOR_DEVICE: + if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); + break; + default: + BUG(); + } +} + +static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, + size_t size) +{ + u64 mask = DMA_BIT_MASK(32); + + if (dev && dev->coherent_dma_mask) + mask = dev->coherent_dma_mask; + return addr + size - 1 <= mask; +} + +static void * +swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, + unsigned long attrs) +{ + phys_addr_t phys_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) + goto out_warn; + + phys_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + 0, size, DMA_FROM_DEVICE, attrs); + if (phys_addr == SWIOTLB_MAP_ERROR) + goto out_warn; + + *dma_handle = __phys_to_dma(dev, phys_addr); + if (!dma_coherent_ok(dev, *dma_handle, size)) + goto out_unmap; + + memset(phys_to_virt(phys_addr), 0, size); + return phys_to_virt(phys_addr); + +out_unmap: + dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dev->coherent_dma_mask, + (unsigned long long)*dma_handle); + + /* + * DMA_TO_DEVICE to avoid memcpy in unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); +out_warn: + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { + dev_warn(dev, + "swiotlb: coherent allocation failed, size=%zu\n", + size); + dump_stack(); + } + return NULL; +} + +static bool swiotlb_free_buffer(struct device *dev, size_t size, + dma_addr_t dma_addr) +{ + phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); + + WARN_ON_ONCE(irqs_disabled()); + + if (!is_swiotlb_buffer(phys_addr)) + return false; + + /* + * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + return true; +} + +static void +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) +{ + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. + */ + dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", + size); + + if (size <= io_tlb_overflow || !do_panic) + return; + + if (dir == DMA_BIDIRECTIONAL) + panic("DMA: Random memory could be DMA accessed\n"); + if (dir == DMA_FROM_DEVICE) + panic("DMA: Random memory could be DMA written\n"); + if (dir == DMA_TO_DEVICE) + panic("DMA: Random memory could be DMA read\n"); +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. + */ +dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t map, phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = phys_to_dma(dev, phys); + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) + return dev_addr; + + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + + /* Oh well, have to allocate and map a bounce buffer. */ + map = map_single(dev, phys, size, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + swiotlb_full(dev, size, dir, 1); + return __phys_to_dma(dev, io_tlb_overflow_buffer); + } + + dev_addr = __phys_to_dma(dev, map); + + /* Ensure that the address returned is DMA'ble */ + if (dma_capable(dev, dev_addr, size)) + return dev_addr; + + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + + return __phys_to_dma(dev, io_tlb_overflow_buffer); +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_page call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + /* + * phys_to_virt doesn't work with hihgmem page but we could + * call dma_mark_clean() with hihgmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary. + */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unmap_single(hwdev, dev_addr, size, dir, attrs); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_page are the + * same here. + */ +int +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); + + if (swiotlb_force == SWIOTLB_FORCE || + !dma_capable(hwdev, dev_addr, sg->length)) { + phys_addr_t map = map_single(hwdev, sg_phys(sg), + sg->length, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + swiotlb_full(hwdev, sg->length, dir, 0); + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sg_dma_len(sgl) = 0; + return 0; + } + sg->dma_address = __phys_to_dma(hwdev, map); + } else + sg->dma_address = dev_addr; + sg_dma_len(sg) = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + attrs); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, + sg_dma_len(sg), dir, target); +} + +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} + +int +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); +} + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; +} + +void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + void *vaddr; + + /* temporary workaround: */ + if (gfp & __GFP_NOWARN) + attrs |= DMA_ATTR_NO_WARN; + + /* + * Don't print a warning when the first allocation attempt fails. + * swiotlb_alloc_coherent() will print a warning when the DMA memory + * allocation ultimately failed. + */ + gfp |= __GFP_NOWARN; + + vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + if (!vaddr) + vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); + return vaddr; +} + +void swiotlb_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, unsigned long attrs) +{ + if (!swiotlb_free_buffer(dev, size, dma_addr)) + dma_direct_free(dev, size, vaddr, dma_addr, attrs); +} + +const struct dma_map_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc = swiotlb_alloc, + .free = swiotlb_free, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .dma_supported = dma_direct_supported, +}; diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c new file mode 100644 index 000000000000..631ddec4b60a --- /dev/null +++ b/kernel/dma/virt.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map to virtual addresses without flushing memory. + */ +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/dma-mapping.h> +#include <linux/scatterlist.h> + +static void *dma_virt_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + void *ret; + + ret = (void *)__get_free_pages(gfp, get_order(size)); + if (ret) + *dma_handle = (uintptr_t)ret; + return ret; +} + +static void dma_virt_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, + unsigned long attrs) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + return (uintptr_t)(page_address(page) + offset); +} + +static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg_dma_address(sg) = (uintptr_t)sg_virt(sg); + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +const struct dma_map_ops dma_virt_ops = { + .alloc = dma_virt_alloc, + .free = dma_virt_free, + .map_page = dma_virt_map_page, + .map_sg = dma_virt_map_sg, +}; +EXPORT_SYMBOL(dma_virt_ops); diff --git a/kernel/events/core.c b/kernel/events/core.c index 08f5e1b42b43..80cca2b30c4f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd) return file; } +const struct perf_event *perf_get_event(struct file *file) +{ + if (file->f_op != &perf_fops) + return ERR_PTR(-EINVAL); + + return file->private_data; +} + const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { if (!event) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 1d8ca9ea9979..045a37e9ddee 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -614,7 +614,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, } } - rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, + node); if (!rb->aux_pages) return -ENOMEM; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1725b902983f..ccc579a7d32e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1184,7 +1184,8 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (unlikely(!area)) goto out; - area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); + area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long), + GFP_KERNEL); if (!area->bitmap) goto free_area; diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 1d5632d8bbcc..5349c91c2298 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -258,7 +258,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + buf = kmalloc(count + 1, GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/kernel/fork.c b/kernel/fork.c index a5d21c42acfc..9440d61b925c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -440,6 +440,14 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, continue; } charge = 0; + /* + * Don't duplicate many vmas if we've been oom-killed (for + * example) + */ + if (fatal_signal_pending(current)) { + retval = -EINTR; + goto out; + } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -811,7 +819,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif @@ -899,6 +907,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); @@ -1712,7 +1721,7 @@ static __latent_entropy struct task_struct *copy_process( p->start_time = ktime_get_ns(); p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; - p->audit_context = NULL; + audit_set_context(p, NULL); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1899,6 +1908,8 @@ static __latent_entropy struct task_struct *copy_process( */ copy_seccomp(p); + rseq_fork(p, clone_flags); + /* * Process group and session signals need to be delivered to just the * parent before the fork or both the parent and the child after the diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1276aabaab55..1e3823fa799b 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -53,23 +53,16 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL - default GCOV_FORMAT_AUTODETECT ---help--- - The gcov format is usually determined by the GCC version, but there are + The gcov format is usually determined by the GCC version, and the + default is chosen according to your GCC version. However, there are exceptions where format changes are integrated in lower-version GCCs. - In such a case use this option to adjust the format used in the kernel - accordingly. - - If unsure, choose "Autodetect". - -config GCOV_FORMAT_AUTODETECT - bool "Autodetect" - ---help--- - Select this option to use the format that corresponds to your GCC - version. + In such a case, change this option to adjust the format used in the + kernel accordingly. config GCOV_FORMAT_3_4 bool "GCC 3.4 format" + depends on CC_IS_GCC && GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index c6c50e5c680e..ff06d64df397 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -4,5 +4,3 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o -obj-$(CONFIG_GCOV_FORMAT_AUTODETECT) += $(call cc-ifversion, -lt, 0407, \ - gcc_3_4.o, gcc_4_7.o) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593ed7c0b..32b479468e4d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/iomem.c b/kernel/iomem.c new file mode 100644 index 000000000000..f7525e14ebc6 --- /dev/null +++ b/kernel/iomem.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + /* In the simple case just return the existing linear address */ + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) + return __va(offset); + + return NULL; /* fallback to arch_memremap_wb */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. + * + * MEMREMAP_WB - matches the default mapping for System RAM on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + void *addr = NULL; + + if (!flags) + return NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in System RAM. + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size, flags); + if (!addr) + addr = arch_memremap_wb(offset, size); + } + + /* + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * System RAM. + */ + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) + addr = ioremap_wt(offset, size); + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4dadeb3d6666..6f636136cccc 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -55,6 +55,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE), BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), + BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), }; static void diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e3336d904f64..daeabd791d58 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -24,6 +24,7 @@ #ifdef CONFIG_IRQ_FORCED_THREADING __read_mostly bool force_irqthreads; +EXPORT_SYMBOL_GPL(force_irqthreads); static int __init setup_forced_irqthreads(char *arg) { @@ -204,6 +205,39 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + irqd_set_move_pending(data); + irq_copy_pending(desc, dest); + return 0; +} +#else +static inline int irq_set_affinity_pending(struct irq_data *data, + const struct cpumask *dest) +{ + return -EBUSY; +} +#endif + +static int irq_try_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + int ret = irq_do_set_affinity(data, dest, force); + + /* + * In case that the underlying vector management is busy and the + * architecture supports the generic pending mechanism then utilize + * this to avoid returning an error to user space. + */ + if (ret == -EBUSY && !force) + ret = irq_set_affinity_pending(data, dest); + return ret; +} + int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -214,8 +248,8 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, force); + if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { + ret = irq_try_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 86ae0eb80b53..def48589ea48 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -38,17 +38,18 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear) void irq_move_masked_irq(struct irq_data *idata) { struct irq_desc *desc = irq_data_to_desc(idata); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_data *data = &desc->irq_data; + struct irq_chip *chip = data->chip; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(data))) return; - irqd_clr_move_pending(&desc->irq_data); + irqd_clr_move_pending(data); /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (irqd_is_per_cpu(&desc->irq_data)) { + if (irqd_is_per_cpu(data)) { WARN_ON(1); return; } @@ -73,13 +74,24 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) - irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); - + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) { + int ret; + + ret = irq_do_set_affinity(data, desc->pending_mask, false); + /* + * If the there is a cleanup pending in the underlying + * vector management, reschedule the move for the next + * interrupt. Leave desc->pending_mask intact. + */ + if (ret == -EBUSY) { + irqd_set_move_pending(data); + return; + } + } cpumask_clear(desc->pending_mask); } -void irq_move_irq(struct irq_data *idata) +void __irq_move_irq(struct irq_data *idata) { bool masked; @@ -90,9 +102,6 @@ void irq_move_irq(struct irq_data *idata) */ idata = irq_desc_get_irq_data(irq_data_to_desc(idata)); - if (likely(!irqd_is_setaffinity_pending(idata))) - return; - if (unlikely(irqd_irq_disabled(idata))) return; diff --git a/kernel/kcov.c b/kernel/kcov.c index 2c16f1ab5e10..3ebd09efe72a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -58,7 +58,7 @@ struct kcov { static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) { - enum kcov_mode mode; + unsigned int mode; /* * We are interested in code coverage as a function of a syscall inputs, @@ -241,7 +241,8 @@ static void kcov_put(struct kcov *kcov) void kcov_task_init(struct task_struct *t) { - t->kcov_mode = KCOV_MODE_DISABLED; + WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED); + barrier(); t->kcov_size = 0; t->kcov_area = NULL; t->kcov = NULL; @@ -323,6 +324,21 @@ static int kcov_close(struct inode *inode, struct file *filep) return 0; } +/* + * Fault in a lazily-faulted vmalloc area before it can be used by + * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * vmalloc fault handling path is instrumented. + */ +static void kcov_fault_in_area(struct kcov *kcov) +{ + unsigned long stride = PAGE_SIZE / sizeof(unsigned long); + unsigned long *area = kcov->area; + unsigned long offset; + + for (offset = 0; offset < kcov->size; offset += stride) + READ_ONCE(area[offset]); +} + static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { @@ -371,6 +387,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, #endif else return -EINVAL; + kcov_fault_in_area(kcov); /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 20fef1a38602..23a83a4da38a 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -829,6 +829,8 @@ static int kimage_load_normal_segment(struct kimage *image, else buf += mchunk; mbytes -= mchunk; + + cond_resched(); } out: return result; @@ -893,6 +895,8 @@ static int kimage_load_crash_segment(struct kimage *image, else buf += mchunk; mbytes -= mchunk; + + cond_resched(); } out: return result; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 75d8e7cf040e..c6a3b6851372 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -793,7 +793,7 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * The section headers in kexec_purgatory are read-only. In order to * have them modifiable make a temporary copy. */ - sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + sechdrs = vzalloc(array_size(sizeof(Elf_Shdr), pi->ehdr->e_shnum)); if (!sechdrs) return -ENOMEM; memcpy(sechdrs, (void *)pi->ehdr + pi->ehdr->e_shoff, diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 6850ffd69125..8402b3349dca 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -913,7 +913,9 @@ static int __init lock_torture_init(void) /* Initialize the statistics so that each run gets its own numbers. */ if (nwriters_stress) { lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress, + sizeof(*cxt.lwsa), + GFP_KERNEL); if (cxt.lwsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); firsterr = -ENOMEM; @@ -942,7 +944,9 @@ static int __init lock_torture_init(void) if (nreaders_stress) { lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress, + sizeof(*cxt.lrsa), + GFP_KERNEL); if (cxt.lrsa == NULL) { VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); firsterr = -ENOMEM; @@ -985,7 +989,8 @@ static int __init lock_torture_init(void) } if (nwriters_stress) { - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + writer_tasks = kcalloc(cxt.nrealwriters_stress, + sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); @@ -995,7 +1000,8 @@ static int __init lock_torture_init(void) } if (cxt.cur_ops->readlock) { - reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]), + reader_tasks = kcalloc(cxt.nrealreaders_stress, + sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); diff --git a/kernel/memremap.c b/kernel/memremap.c index 895e6b76b25e..5857267a4af5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include <linux/radix-tree.h> #include <linux/device.h> #include <linux/types.h> @@ -19,170 +9,8 @@ #include <linux/memory_hotplug.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/wait_bit.h> -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - -#ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) -{ - return (__force void *)ioremap_cache(offset, size); -} -#endif - -#ifndef arch_memremap_can_ram_remap -static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - return true; -} -#endif - -static void *try_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - unsigned long pfn = PHYS_PFN(offset); - - /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && - arch_memremap_can_ram_remap(offset, size, flags)) - return __va(offset); - - return NULL; /* fallback to arch_memremap_wb */ -} - -/** - * memremap() - remap an iomem_resource as cacheable memory - * @offset: iomem resource start address - * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, - * MEMREMAP_ENC, MEMREMAP_DEC - * - * memremap() is "ioremap" for cases where it is known that the resource - * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. In the case of multiple flags, the different - * mapping types will be attempted in the order listed below until one of - * them succeeds. - * - * MEMREMAP_WB - matches the default mapping for System RAM on - * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM - * memremap() will bypass establishing a new mapping and instead return - * a pointer into the direct map. - * - * MEMREMAP_WT - establish a mapping whereby writes either bypass the - * cache or are written through to memory and never exist in a - * cache-dirty state with respect to program visibility. Attempts to - * map System RAM with this mapping type will fail. - * - * MEMREMAP_WC - establish a writecombine mapping, whereby writes may - * be coalesced together (e.g. in the CPU's write buffers), but is otherwise - * uncached. Attempts to map System RAM with this mapping type will fail. - */ -void *memremap(resource_size_t offset, size_t size, unsigned long flags) -{ - int is_ram = region_intersects(offset, size, - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - void *addr = NULL; - - if (!flags) - return NULL; - - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - /* Try all mapping types requested until one returns non-NULL */ - if (flags & MEMREMAP_WB) { - /* - * MEMREMAP_WB is special in that it can be satisifed - * from the direct map. Some archs depend on the - * capability of memremap() to autodetect cases where - * the requested range is potentially in System RAM. - */ - if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size, flags); - if (!addr) - addr = arch_memremap_wb(offset, size); - } - - /* - * If we don't have a mapping yet and other request flags are - * present then we will be attempting to establish a new virtual - * address mapping. Enforce that this mapping is not aliasing - * System RAM. - */ - if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { - WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - if (!addr && (flags & MEMREMAP_WT)) - addr = ioremap_wt(offset, size); - - if (!addr && (flags & MEMREMAP_WC)) - addr = ioremap_wc(offset, size); - - return addr; -} -EXPORT_SYMBOL(memremap); - -void memunmap(void *addr) -{ - if (is_vmalloc_addr(addr)) - iounmap((void __iomem *) addr); -} -EXPORT_SYMBOL(memunmap); - -static void devm_memremap_release(struct device *dev, void *res) -{ - memunmap(*(void **)res); -} - -static int devm_memremap_match(struct device *dev, void *res, void *match_data) -{ - return *(void **)res == match_data; -} - -void *devm_memremap(struct device *dev, resource_size_t offset, - size_t size, unsigned long flags) -{ - void **ptr, *addr; - - ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, - dev_to_node(dev)); - if (!ptr) - return ERR_PTR(-ENOMEM); - - addr = memremap(offset, size, flags); - if (addr) { - *ptr = addr; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENXIO); - } - - return addr; -} -EXPORT_SYMBOL(devm_memremap); - -void devm_memunmap(struct device *dev, void *addr) -{ - WARN_ON(devres_release(dev, devm_memremap_release, - devm_memremap_match, addr)); -} -EXPORT_SYMBOL(devm_memunmap); - -#ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return pgmap; } -#endif /* CONFIG_ZONE_DEVICE */ +EXPORT_SYMBOL_GPL(get_dev_pagemap); + +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL_GPL(devmap_managed_key); +static atomic_t devmap_enable; + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. + */ +void dev_pagemap_get_ops(void) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); + +void dev_pagemap_put_ops(void) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page) +void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_or_public_page); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/module.c b/kernel/module.c index c9bea7f2b43e..f475f30eed8c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -274,9 +274,7 @@ static void module_assert_mutex_or_preempt(void) } static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); -#ifndef CONFIG_MODULE_SIG_FORCE module_param(sig_enforce, bool_enable_only, 0644); -#endif /* !CONFIG_MODULE_SIG_FORCE */ /* * Export sig_enforce kernel cmdline parameter to allow other subsystems rely @@ -1604,8 +1602,7 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info) if (notes == 0) return; - notes_attrs = kzalloc(sizeof(*notes_attrs) - + notes * sizeof(notes_attrs->attrs[0]), + notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes), GFP_KERNEL); if (notes_attrs == NULL) return; @@ -2786,7 +2783,7 @@ static int module_sig_check(struct load_info *info, int flags) } /* Not having a signature is only an error if we're strict. */ - if (err == -ENOKEY && !sig_enforce) + if (err == -ENOKEY && !is_module_sig_enforced()) err = 0; return err; diff --git a/kernel/panic.c b/kernel/panic.c index 42e487488554..8b2e002d52eb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -623,7 +623,7 @@ static __init int register_warn_debugfs(void) device_initcall(register_warn_debugfs); #endif -#ifdef CONFIG_CC_STACKPROTECTOR +#ifdef CONFIG_STACKPROTECTOR /* * Called when gcc's -fstack-protector feature is used, and diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 5454cc639a8d..9c85c7822383 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -287,6 +287,8 @@ static int create_image(int platform_mode) local_irq_disable(); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (error) { pr_err("Some system devices failed to power down, aborting hibernation\n"); @@ -317,6 +319,7 @@ static int create_image(int platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; error = syscore_suspend(); if (error) @@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode) syscore_resume(); Enable_irqs: + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: @@ -563,6 +568,7 @@ int hibernation_platform_enter(void) goto Enable_cpus; local_irq_disable(); + system_state = SYSTEM_SUSPEND; syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -575,6 +581,7 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); + system_state = SYSTEM_RUNNING; local_irq_enable(); Enable_cpus: diff --git a/kernel/power/main.c b/kernel/power/main.c index 705c2366dafe..d9706da10930 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -455,8 +455,9 @@ struct kobject *power_kobj; * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", - * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a - * description of what they mean. + * "freeze" and "disk" (hibernation). + * See Documentation/admin-guide/pm/sleep-states.rst for a description of + * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. diff --git a/kernel/power/qos.c b/kernel/power/qos.c index fa39092b7aea..86d72ffb811b 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -184,7 +184,6 @@ static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) c->target_value = value; } -static inline int pm_qos_get_value(struct pm_qos_constraints *c); static int pm_qos_dbg_show_requests(struct seq_file *s, void *unused) { struct pm_qos_object *qos = (struct pm_qos_object *)s->private; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4c10be0f4843..87331565e505 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/swait.h> #include <linux/ftrace.h> #include <trace/events/power.h> #include <linux/compiler.h> @@ -57,10 +58,10 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); static const struct platform_suspend_ops *suspend_ops; static const struct platform_s2idle_ops *s2idle_ops; -static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; -static DEFINE_SPINLOCK(s2idle_lock); +static DEFINE_RAW_SPINLOCK(s2idle_lock); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { @@ -78,12 +79,12 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; s2idle_state = S2IDLE_STATE_ENTER; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); get_online_cpus(); cpuidle_resume(); @@ -91,17 +92,17 @@ static void s2idle_enter(void) /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ - wait_event(s2idle_wait_head, - s2idle_state == S2IDLE_STATE_WAKE); + swait_event(s2idle_wait_head, + s2idle_state == S2IDLE_STATE_WAKE); cpuidle_pause(); put_online_cpus(); - spin_lock_irq(&s2idle_lock); + raw_spin_lock_irq(&s2idle_lock); out: s2idle_state = S2IDLE_STATE_NONE; - spin_unlock_irq(&s2idle_lock); + raw_spin_unlock_irq(&s2idle_lock); trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, false); } @@ -156,12 +157,12 @@ void s2idle_wake(void) { unsigned long flags; - spin_lock_irqsave(&s2idle_lock, flags); + raw_spin_lock_irqsave(&s2idle_lock, flags); if (s2idle_state > S2IDLE_STATE_NONE) { s2idle_state = S2IDLE_STATE_WAKE; - wake_up(&s2idle_wait_head); + swake_up(&s2idle_wait_head); } - spin_unlock_irqrestore(&s2idle_lock, flags); + raw_spin_unlock_irqrestore(&s2idle_lock, flags); } EXPORT_SYMBOL_GPL(s2idle_wake); @@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); + system_state = SYSTEM_SUSPEND; + error = syscore_suspend(); if (!error) { *wakeup = pm_wakeup_pending(); @@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) syscore_resume(); } + system_state = SYSTEM_RUNNING; + arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1efcb5b0c3ed..c2bcf97d24c8 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -698,7 +698,7 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vmalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; @@ -1183,14 +1183,14 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); + page = vmalloc(array_size(LZO_MAX_RD_PAGES, sizeof(*page))); if (!page) { pr_err("Failed to allocate LZO page\n"); ret = -ENOMEM; goto out_clean; } - data = vmalloc(sizeof(*data) * nr_threads); + data = vmalloc(array_size(nr_threads, sizeof(*data))); if (!data) { pr_err("Failed to allocate LZO data\n"); ret = -ENOMEM; diff --git a/kernel/power/user.c b/kernel/power/user.c index 75c959de4b29..abd225550271 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -186,6 +186,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, res = PAGE_SIZE - pg_offp; } + if (!data_of(data->handle)) { + res = -EINVAL; + goto unlock; + } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index dfba59be190b..4210152e56f0 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -188,6 +188,7 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len, return ERR_PTR(-ENOMEM); } wl->ws.name = wl->name; + wl->ws.last_time = ktime_get(); wakeup_source_add(&wl->ws); rb_link_node(&wl->node, parent, node); rb_insert_color(&wl->node, &wakelocks_tree); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2f4af216bd6e..247808333ba4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1908,6 +1908,7 @@ asmlinkage int vprintk_emit(int facility, int level, preempt_enable(); } + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); @@ -2289,9 +2290,7 @@ void console_unlock(void) { static char ext_text[CONSOLE_EXT_LOG_MAX]; static char text[LOG_LINE_MAX + PREFIX_MAX]; - static u64 seen_seq; unsigned long flags; - bool wake_klogd = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2335,11 +2334,6 @@ again: printk_safe_enter_irqsave(flags); raw_spin_lock(&logbuf_lock); - if (seen_seq != log_next_seq) { - wake_klogd = true; - seen_seq = log_next_seq; - } - if (console_seq < log_first_seq) { len = sprintf(text, "** %u printk messages dropped **\n", (unsigned)(log_first_seq - console_seq)); @@ -2397,7 +2391,7 @@ skip: if (console_lock_spinning_disable_and_check()) { printk_safe_exit_irqrestore(flags); - goto out; + return; } printk_safe_exit_irqrestore(flags); @@ -2429,10 +2423,6 @@ skip: if (retry && console_trylock()) goto again; - -out: - if (wake_klogd) - wake_up_klogd(); } EXPORT_SYMBOL(console_unlock); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 3e3c2004bb23..d7d091309054 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -82,6 +82,7 @@ static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, { int add; size_t len; + va_list ap; again: len = atomic_read(&s->len); @@ -100,7 +101,9 @@ again: if (!len) smp_rmb(); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + va_copy(ap, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); + va_end(ap); if (!add) return 0; @@ -278,7 +281,7 @@ void printk_safe_flush_on_panic(void) * Make sure that we could access the main ring buffer. * Do not risk a double release when more CPUs are up. */ - if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) { + if (raw_spin_is_locked(&logbuf_lock)) { if (num_online_cpus() > 1) return; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e628fcfd1bde..42fcb7f05fac 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -831,8 +831,9 @@ rcu_torture_cbflood(void *arg) cbflood_intra_holdoff > 0 && cur_ops->call && cur_ops->cb_barrier) { - rhp = vmalloc(sizeof(*rhp) * - cbflood_n_burst * cbflood_n_per_burst); + rhp = vmalloc(array3_size(cbflood_n_burst, + cbflood_n_per_burst, + sizeof(*rhp))); err = !rhp; } if (err) { diff --git a/kernel/relay.c b/kernel/relay.c index c955b10c973c..04f248644e06 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -39,7 +39,7 @@ static void relay_file_mmap_close(struct vm_area_struct *vma) /* * fault() vm_op implementation for relay file mapping. */ -static int relay_buf_fault(struct vm_fault *vmf) +static vm_fault_t relay_buf_fault(struct vm_fault *vmf) { struct page *page; struct rchan_buf *buf = vmf->vma->vm_private_data; @@ -169,7 +169,8 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) return NULL; - buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); + buf->padding = kmalloc_array(chan->n_subbufs, sizeof(size_t *), + GFP_KERNEL); if (!buf->padding) goto free_buf; diff --git a/kernel/resource.c b/kernel/resource.c index b589dda910b3..30e1bc68503b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -415,6 +415,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return __walk_iomem_res_desc(&res, desc, false, arg, func); } +EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type diff --git a/kernel/rseq.c b/kernel/rseq.c new file mode 100644 index 000000000000..ae306f90c514 --- /dev/null +++ b/kernel/rseq.c @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Restartable sequences system call + * + * Copyright (C) 2015, Google, Inc., + * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> + * Copyright (C) 2015-2018, EfficiOS Inc., + * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> + */ + +#include <linux/sched.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/rseq.h> +#include <linux/types.h> +#include <asm/ptrace.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> + +#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \ + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) + +/* + * + * Restartable sequences are a lightweight interface that allows + * user-level code to be executed atomically relative to scheduler + * preemption and signal delivery. Typically used for implementing + * per-cpu operations. + * + * It allows user-space to perform update operations on per-cpu data + * without requiring heavy-weight atomic operations. + * + * Detailed algorithm of rseq user-space assembly sequences: + * + * init(rseq_cs) + * cpu = TLS->rseq::cpu_id_start + * [1] TLS->rseq::rseq_cs = rseq_cs + * [start_ip] ---------------------------- + * [2] if (cpu != TLS->rseq::cpu_id) + * goto abort_ip; + * [3] <last_instruction_in_cs> + * [post_commit_ip] ---------------------------- + * + * The address of jump target abort_ip must be outside the critical + * region, i.e.: + * + * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] + * + * Steps [2]-[3] (inclusive) need to be a sequence of instructions in + * userspace that can handle being interrupted between any of those + * instructions, and then resumed to the abort_ip. + * + * 1. Userspace stores the address of the struct rseq_cs assembly + * block descriptor into the rseq_cs field of the registered + * struct rseq TLS area. This update is performed through a single + * store within the inline assembly instruction sequence. + * [start_ip] + * + * 2. Userspace tests to check whether the current cpu_id field match + * the cpu number loaded before start_ip, branching to abort_ip + * in case of a mismatch. + * + * If the sequence is preempted or interrupted by a signal + * at or after start_ip and before post_commit_ip, then the kernel + * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return + * ip to abort_ip before returning to user-space, so the preempted + * execution resumes at abort_ip. + * + * 3. Userspace critical section final instruction before + * post_commit_ip is the commit. The critical section is + * self-terminating. + * [post_commit_ip] + * + * 4. <success> + * + * On failure at [2], or if interrupted by preempt or signal delivery + * between [1] and [3]: + * + * [abort_ip] + * F1. <failure> + */ + +static int rseq_update_cpu_id(struct task_struct *t) +{ + u32 cpu_id = raw_smp_processor_id(); + + if (__put_user(cpu_id, &t->rseq->cpu_id_start)) + return -EFAULT; + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + trace_rseq_update(t); + return 0; +} + +static int rseq_reset_rseq_cpu_id(struct task_struct *t) +{ + u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + + /* + * Reset cpu_id_start to its initial state (0). + */ + if (__put_user(cpu_id_start, &t->rseq->cpu_id_start)) + return -EFAULT; + /* + * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming + * in after unregistration can figure out that rseq needs to be + * registered again. + */ + if (__put_user(cpu_id, &t->rseq->cpu_id)) + return -EFAULT; + return 0; +} + +static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +{ + struct rseq_cs __user *urseq_cs; + unsigned long ptr; + u32 __user *usig; + u32 sig; + int ret; + + ret = __get_user(ptr, &t->rseq->rseq_cs); + if (ret) + return ret; + if (!ptr) { + memset(rseq_cs, 0, sizeof(*rseq_cs)); + return 0; + } + urseq_cs = (struct rseq_cs __user *)ptr; + if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return -EFAULT; + if (rseq_cs->version > 0) + return -EINVAL; + + /* Ensure that abort_ip is not in the critical section. */ + if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) + return -EINVAL; + + usig = (u32 __user *)(rseq_cs->abort_ip - sizeof(u32)); + ret = get_user(sig, usig); + if (ret) + return ret; + + if (current->rseq_sig != sig) { + printk_ratelimited(KERN_WARNING + "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", + sig, current->rseq_sig, current->pid, usig); + return -EPERM; + } + return 0; +} + +static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +{ + u32 flags, event_mask; + int ret; + + /* Get thread flags. */ + ret = __get_user(flags, &t->rseq->flags); + if (ret) + return ret; + + /* Take critical section flags into account. */ + flags |= cs_flags; + + /* + * Restart on signal can only be inhibited when restart on + * preempt and restart on migrate are inhibited too. Otherwise, + * a preempted signal handler could fail to restart the prior + * execution context on sigreturn. + */ + if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) && + (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) != + RSEQ_CS_PREEMPT_MIGRATE_FLAGS)) + return -EINVAL; + + /* + * Load and clear event mask atomically with respect to + * scheduler preemption. + */ + preempt_disable(); + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + preempt_enable(); + + return !!(event_mask & ~flags); +} + +static int clear_rseq_cs(struct task_struct *t) +{ + /* + * The rseq_cs field is set to NULL on preemption or signal + * delivery on top of rseq assembly block, as well as on top + * of code outside of the rseq assembly block. This performs + * a lazy clear of the rseq_cs field. + * + * Set rseq_cs to NULL with single-copy atomicity. + */ + return __put_user(0UL, &t->rseq->rseq_cs); +} + +/* + * Unsigned comparison will be true when ip >= start_ip, and when + * ip < start_ip + post_commit_offset. + */ +static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +{ + return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; +} + +static int rseq_ip_fixup(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + int ret; + + ret = rseq_get_rseq_cs(t, &rseq_cs); + if (ret) + return ret; + + /* + * Handle potentially not being within a critical section. + * If not nested over a rseq critical section, restart is useless. + * Clear the rseq_cs pointer and return. + */ + if (!in_rseq_cs(ip, &rseq_cs)) + return clear_rseq_cs(t); + ret = rseq_need_restart(t, rseq_cs.flags); + if (ret <= 0) + return ret; + ret = clear_rseq_cs(t); + if (ret) + return ret; + trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, + rseq_cs.abort_ip); + instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); + return 0; +} + +/* + * This resume handler must always be executed between any of: + * - preemption, + * - signal delivery, + * and return to user-space. + * + * This is how we can ensure that the entire rseq critical section, + * consisting of both the C part and the assembly instruction sequence, + * will issue the commit instruction only if executed atomically with + * respect to other threads scheduled on the same CPU, and with respect + * to signal handlers. + */ +void __rseq_handle_notify_resume(struct pt_regs *regs) +{ + struct task_struct *t = current; + int ret; + + if (unlikely(t->flags & PF_EXITING)) + return; + if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + goto error; + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + if (unlikely(rseq_update_cpu_id(t))) + goto error; + return; + +error: + force_sig(SIGSEGV, t); +} + +#ifdef CONFIG_DEBUG_RSEQ + +/* + * Terminate the process if a syscall is issued within a restartable + * sequence. + */ +void rseq_syscall(struct pt_regs *regs) +{ + unsigned long ip = instruction_pointer(regs); + struct task_struct *t = current; + struct rseq_cs rseq_cs; + + if (!t->rseq) + return; + if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) + force_sig(SIGSEGV, t); +} + +#endif + +/* + * sys_rseq - setup restartable sequences for caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, + int, flags, u32, sig) +{ + int ret; + + if (flags & RSEQ_FLAG_UNREGISTER) { + /* Unregister rseq for current thread. */ + if (current->rseq != rseq || !current->rseq) + return -EINVAL; + if (current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + ret = rseq_reset_rseq_cpu_id(current); + if (ret) + return ret; + current->rseq = NULL; + current->rseq_len = 0; + current->rseq_sig = 0; + return 0; + } + + if (unlikely(flags)) + return -EINVAL; + + if (current->rseq) { + /* + * If rseq is already registered, check whether + * the provided address differs from the prior + * one. + */ + if (current->rseq != rseq || current->rseq_len != rseq_len) + return -EINVAL; + if (current->rseq_sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; + } + + /* + * If there was no rseq previously registered, + * ensure the provided rseq is properly aligned and valid. + */ + if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || + rseq_len != sizeof(*rseq)) + return -EINVAL; + if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + return -EFAULT; + current->rseq = rseq; + current->rseq_len = rseq_len; + current->rseq_sig = sig; + /* + * If rseq was previously inactive, and has just been + * registered, ensure the cpu_id_start and cpu_id fields + * are updated before returning to user-space. + */ + rseq_set_notify_resume(current); + + return 0; +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9866f86f304..78d8facba456 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -10,6 +10,8 @@ #include <linux/kthread.h> #include <linux/nospec.h> +#include <linux/kcov.h> + #include <asm/switch_to.h> #include <asm/tlb.h> @@ -1191,6 +1193,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; + rseq_migrate(p); perf_event_task_migrate(p); } @@ -2632,8 +2635,10 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); + rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); prepare_task(next); prepare_arch_switch(next); @@ -2700,6 +2705,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_task(prev); finish_lock_switch(rq); finish_arch_post_lock_switch(); + kcov_finish_switch(current); fire_sched_in_preempt_notifiers(current); /* diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28592b62b1d5..3cde46483f0a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -51,7 +51,7 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; unsigned int iowait_boost_max; - u64 last_update; + u64 last_update; /* The fields below are only needed when sharing a policy: */ unsigned long util_cfs; @@ -89,46 +89,52 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) * schedule the kthread. */ if (sg_policy->policy->fast_switch_enabled && - !cpufreq_can_do_remote_dvfs(sg_policy->policy)) + !cpufreq_this_cpu_can_update(sg_policy->policy)) return false; - if (sg_policy->work_in_progress) - return false; - - if (unlikely(sg_policy->need_freq_update)) { - sg_policy->need_freq_update = false; - /* - * This happens when limits change, so forget the previous - * next_freq value and force an update. - */ - sg_policy->next_freq = UINT_MAX; + if (unlikely(sg_policy->need_freq_update)) return true; - } delta_ns = time - sg_policy->last_freq_update_time; return delta_ns >= sg_policy->freq_update_delay_ns; } -static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, - unsigned int next_freq) +static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) { - struct cpufreq_policy *policy = sg_policy->policy; - if (sg_policy->next_freq == next_freq) - return; + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; - if (policy->fast_switch_enabled) { - next_freq = cpufreq_driver_fast_switch(policy, next_freq); - if (!next_freq) - return; + return true; +} - policy->cur = next_freq; - trace_cpu_frequency(next_freq, smp_processor_id()); - } else { +static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (!next_freq) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); +} + +static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + if (!sugov_update_next_freq(sg_policy, time, next_freq)) + return; + + if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -165,8 +171,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; + + sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -200,43 +208,120 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) +/** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @set_iowait_boost: true if an IO boost has been requested + * + * The IO wait boost of a task is disabled after a tick since the last update + * of a CPU. If a new IO wait boost is requested after more then a tick, then + * we enable the boost starting from the minimum frequency, which improves + * energy efficiency by ignoring sporadic wakeups from IO. + */ +static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, + bool set_iowait_boost) { - if (flags & SCHED_CPUFREQ_IOWAIT) { - if (sg_cpu->iowait_boost_pending) - return; + s64 delta_ns = time - sg_cpu->last_update; - sg_cpu->iowait_boost_pending = true; + /* Reset boost only if a tick has elapsed since last request */ + if (delta_ns <= TICK_NSEC) + return false; - if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; - } else { - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; - } - } else if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; + sg_cpu->iowait_boost = set_iowait_boost + ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost_pending = set_iowait_boost; - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } + return true; +} + +/** + * sugov_iowait_boost() - Updates the IO boost status of a CPU. + * @sg_cpu: the sugov data for the CPU to boost + * @time: the update time from the caller + * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait + * + * Each time a task wakes up after an IO operation, the CPU utilization can be + * boosted to a certain utilization which doubles at each "frequent and + * successive" wakeup from IO, ranging from the utilization of the minimum + * OPP to the utilization of the maximum OPP. + * To keep doubling, an IO boost has to be requested at least once per tick, + * otherwise we restart from the utilization of the minimum OPP. + */ +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT; + + /* Reset boost if the CPU appears to have been idle enough */ + if (sg_cpu->iowait_boost && + sugov_iowait_reset(sg_cpu, time, set_iowait_boost)) + return; + + /* Boost only tasks waking up after IO */ + if (!set_iowait_boost) + return; + + /* Ensure boost doubles only one time at each request */ + if (sg_cpu->iowait_boost_pending) + return; + sg_cpu->iowait_boost_pending = true; + + /* Double the boost at each request */ + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + return; } + + /* First wakeup after IO: start with minimum boost */ + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } -static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, - unsigned long *max) +/** + * sugov_iowait_apply() - Apply the IO boost to a CPU. + * @sg_cpu: the sugov data for the cpu to boost + * @time: the update time from the caller + * @util: the utilization to (eventually) boost + * @max: the maximum value the utilization can be boosted to + * + * A CPU running a task which woken up after an IO operation can have its + * utilization boosted to speed up the completion of those IO operations. + * The IO boost value is increased each time a task wakes up from IO, in + * sugov_iowait_apply(), and it's instead decreased by this function, + * each time an increase has not been requested (!iowait_boost_pending). + * + * A CPU which also appears to have been idle for at least one tick has also + * its IO boost utilization reset. + * + * This mechanism is designed to boost high frequently IO waiting tasks, while + * being more conservative on tasks which does sporadic IO operations. + */ +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long *util, unsigned long *max) { unsigned int boost_util, boost_max; + /* No boost currently required */ if (!sg_cpu->iowait_boost) return; + /* Reset boost if the CPU appears to have been idle enough */ + if (sugov_iowait_reset(sg_cpu, time, false)) + return; + + /* + * An IO waiting task has just woken up: + * allow to further double the boost value + */ if (sg_cpu->iowait_boost_pending) { sg_cpu->iowait_boost_pending = false; } else { + /* + * Otherwise: reduce the boost value and disable it when we + * reach the minimum. + */ sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { sg_cpu->iowait_boost = 0; @@ -244,9 +329,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, } } + /* + * Apply the current boost value: a CPU is boosted only if its current + * utilization is smaller then the current IO boost level. + */ boost_util = sg_cpu->iowait_boost; boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; @@ -285,7 +373,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); @@ -298,21 +386,31 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_aggregate_util(sg_cpu); - sugov_iowait_boost(sg_cpu, &util, &max); + sugov_iowait_apply(sg_cpu, time, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq && - sg_policy->next_freq != UINT_MAX) { + if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Reset cached freq as next_freq has changed */ sg_policy->cached_raw_freq = 0; } - sugov_update_commit(sg_policy, time, next_f); + /* + * This code runs under rq->lock for the target CPU, so it won't run + * concurrently on two different CPUs for the same target and it is not + * necessary to acquire the lock in the fast switch case. + */ + if (sg_policy->policy->fast_switch_enabled) { + sugov_fast_switch(sg_policy, time, next_f); + } else { + raw_spin_lock(&sg_policy->update_lock); + sugov_deferred_update(sg_policy, time, next_f); + raw_spin_unlock(&sg_policy->update_lock); + } } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) @@ -325,28 +423,12 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - s64 delta_ns; sugov_get_util(j_sg_cpu); - - /* - * If the CFS CPU utilization was last updated before the - * previous frequency update and the time elapsed between the - * last update of the CPU utilization and the last frequency - * update is long enough, reset iowait_boost and util_cfs, as - * they are now probably stale. However, still consider the - * CPU contribution if it has some DEADLINE utilization - * (util_dl). - */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > TICK_NSEC) { - j_sg_cpu->iowait_boost = 0; - j_sg_cpu->iowait_boost_pending = false; - } - j_max = j_sg_cpu->max; j_util = sugov_aggregate_util(j_sg_cpu); - sugov_iowait_boost(j_sg_cpu, &j_util, &j_max); + sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -365,14 +447,18 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) raw_spin_lock(&sg_policy->update_lock); - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; ignore_dl_rate_limit(sg_cpu, sg_policy); if (sugov_should_update_freq(sg_policy, time)) { next_f = sugov_next_freq_shared(sg_cpu, time); - sugov_update_commit(sg_policy, time, next_f); + + if (sg_policy->policy->fast_switch_enabled) + sugov_fast_switch(sg_policy, time, next_f); + else + sugov_deferred_update(sg_policy, time, next_f); } raw_spin_unlock(&sg_policy->update_lock); @@ -381,13 +467,27 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + unsigned int freq; + unsigned long flags; + + /* + * Hold sg_policy->update_lock shortly to handle the case where: + * incase sg_policy->next_freq is read here, and then updated by + * sugov_deferred_update() just before work_in_progress is set to false + * here, we may miss queueing the new update. + * + * Note: If a work was queued after the update_lock is released, + * sugov_work() will just be called again by kthread_work code; and the + * request will be proceed before the sugov thread sleeps. + */ + raw_spin_lock_irqsave(&sg_policy->update_lock, flags); + freq = sg_policy->next_freq; + sg_policy->work_in_progress = false; + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags); mutex_lock(&sg_policy->work_lock); - __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, - CPUFREQ_RELATION_L); + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); mutex_unlock(&sg_policy->work_lock); - - sg_policy->work_in_progress = false; } static void sugov_irq_work(struct irq_work *irq_work) @@ -510,11 +610,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - - /* Kthread is bound to all CPUs by default */ - if (!policy->dvfs_possible_from_any_cpu) - kthread_bind_mask(thread, policy->related_cpus); - + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -657,7 +753,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; + sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e497c05aab7f..1866e64792a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10215,10 +10215,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL); if (!tg->se) goto err; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ef3c4e6f5345..47556b0c9a95 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -183,10 +183,10 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL); if (!tg->rt_se) goto err; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 61a1125c1ae4..05a831427bc7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1750,7 +1750,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms) int i; cpumask_var_t *doms; - doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL); if (!doms) return NULL; for (i = 0; i < ndoms; i++) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e691d9a6c58d..fd023ac24e10 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -593,18 +593,15 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, } /* - * Force an audit message to be emitted when the action is RET_KILL_*, - * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is - * allowed to be logged by the admin. + * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the + * FILTER_FLAG_LOG bit was set. The admin has the ability to silence + * any action from being logged by removing the action name from the + * seccomp_actions_logged sysctl. */ - if (log) - return __audit_seccomp(syscall, signr, action); + if (!log) + return; - /* - * Let the audit subsystem decide if the action should be audited based - * on whether the current task itself is being audited. - */ - return audit_seccomp(syscall, signr, action); + audit_seccomp(syscall, signr, action); } /* @@ -1144,10 +1141,11 @@ static const struct seccomp_log_name seccomp_log_names[] = { }; static bool seccomp_names_from_actions_logged(char *names, size_t size, - u32 actions_logged) + u32 actions_logged, + const char *sep) { const struct seccomp_log_name *cur; - bool append_space = false; + bool append_sep = false; for (cur = seccomp_log_names; cur->name && size; cur++) { ssize_t ret; @@ -1155,15 +1153,15 @@ static bool seccomp_names_from_actions_logged(char *names, size_t size, if (!(actions_logged & cur->log)) continue; - if (append_space) { - ret = strscpy(names, " ", size); + if (append_sep) { + ret = strscpy(names, sep, size); if (ret < 0) return false; names += ret; size -= ret; } else - append_space = true; + append_sep = true; ret = strscpy(names, cur->name, size); if (ret < 0) @@ -1208,46 +1206,102 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char names[sizeof(seccomp_actions_avail)]; + struct ctl_table table; + + memset(names, 0, sizeof(names)); + + if (!seccomp_names_from_actions_logged(names, sizeof(names), + seccomp_actions_logged, " ")) + return -EINVAL; + + table = *ro_table; + table.data = names; + table.maxlen = sizeof(names); + return proc_dostring(&table, 0, buffer, lenp, ppos); +} + +static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, + size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; struct ctl_table table; int ret; - if (write && !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; memset(names, 0, sizeof(names)); - if (!write) { - if (!seccomp_names_from_actions_logged(names, sizeof(names), - seccomp_actions_logged)) - return -EINVAL; - } - table = *ro_table; table.data = names; table.maxlen = sizeof(names); - ret = proc_dostring(&table, write, buffer, lenp, ppos); + ret = proc_dostring(&table, 1, buffer, lenp, ppos); if (ret) return ret; - if (write) { - u32 actions_logged; + if (!seccomp_actions_logged_from_names(actions_logged, table.data)) + return -EINVAL; - if (!seccomp_actions_logged_from_names(&actions_logged, - table.data)) - return -EINVAL; + if (*actions_logged & SECCOMP_LOG_ALLOW) + return -EINVAL; - if (actions_logged & SECCOMP_LOG_ALLOW) - return -EINVAL; + seccomp_actions_logged = *actions_logged; + return 0; +} - seccomp_actions_logged = actions_logged; - } +static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, + int ret) +{ + char names[sizeof(seccomp_actions_avail)]; + char old_names[sizeof(seccomp_actions_avail)]; + const char *new = names; + const char *old = old_names; - return 0; + if (!audit_enabled) + return; + + memset(names, 0, sizeof(names)); + memset(old_names, 0, sizeof(old_names)); + + if (ret) + new = "?"; + else if (!actions_logged) + new = "(none)"; + else if (!seccomp_names_from_actions_logged(names, sizeof(names), + actions_logged, ",")) + new = "?"; + + if (!old_actions_logged) + old = "(none)"; + else if (!seccomp_names_from_actions_logged(old_names, + sizeof(old_names), + old_actions_logged, ",")) + old = "?"; + + return audit_seccomp_actions_logged(new, old, !ret); +} + +static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + if (write) { + u32 actions_logged = 0; + u32 old_actions_logged = seccomp_actions_logged; + + ret = write_actions_logged(ro_table, buffer, lenp, ppos, + &actions_logged); + audit_actions_logged(actions_logged, old_actions_logged, ret); + } else + ret = read_actions_logged(ro_table, buffer, lenp, ppos); + + return ret; } static struct ctl_path seccomp_sysctl_path[] = { diff --git a/kernel/signal.c b/kernel/signal.c index 0f865d67415d..8d8a940422a8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1244,19 +1244,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, { struct sighand_struct *sighand; + rcu_read_lock(); for (;;) { - /* - * Disable interrupts early to avoid deadlocks. - * See rcu_read_unlock() comment header for details. - */ - local_irq_save(*flags); - rcu_read_lock(); sighand = rcu_dereference(tsk->sighand); - if (unlikely(sighand == NULL)) { - rcu_read_unlock(); - local_irq_restore(*flags); + if (unlikely(sighand == NULL)) break; - } + /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which @@ -1268,15 +1261,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ - spin_lock(&sighand->siglock); - if (likely(sighand == tsk->sighand)) { - rcu_read_unlock(); + spin_lock_irqsave(&sighand->siglock, *flags); + if (likely(sighand == tsk->sighand)) break; - } - spin_unlock(&sighand->siglock); - rcu_read_unlock(); - local_irq_restore(*flags); + spin_unlock_irqrestore(&sighand->siglock, *flags); } + rcu_read_unlock(); return sighand; } diff --git a/kernel/softirq.c b/kernel/softirq.c index de2f57fddc04..900dcfee542c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -139,9 +139,13 @@ static void __local_bh_enable(unsigned int cnt) { lockdep_assert_irqs_disabled(); + if (preempt_count() == cnt) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); - preempt_count_sub(cnt); + + __preempt_count_sub(cnt); } /* diff --git a/kernel/sys.c b/kernel/sys.c index d1b2b8d934bb..38509dc1f77b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. + */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 183169c2a75b..df556175be50 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -367,7 +367,7 @@ COND_SYSCALL(s390_pci_mmio_write); COND_SYSCALL_COMPAT(s390_ipc); /* powerpc */ -cond_syscall(ppc_rtas); +COND_SYSCALL(rtas); COND_SYSCALL(spu_run); COND_SYSCALL(spu_create); COND_SYSCALL(subpage_prot); @@ -432,3 +432,6 @@ COND_SYSCALL(setresgid16); COND_SYSCALL(setresuid16); COND_SYSCALL(setreuid16); COND_SYSCALL(setuid16); + +/* restartable sequence */ +COND_SYSCALL(rseq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6a78cf70761d..2d9837c0aff4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3047,7 +3047,8 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), + sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 055a4a728c00..3e93c54bd3a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1659,7 +1659,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..9cdf54b04ca8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -604,7 +604,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - lockdep_assert_irqs_disabled(); ret = 0; old_incr = timer->it.cpu.incr; @@ -1049,7 +1048,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - lockdep_assert_irqs_disabled(); arm_timer(timer); unlock: unlock_task_sighand(p, &flags); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 78e598334007..b7005dd21ec1 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -491,6 +491,7 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + system_state = SYSTEM_SUSPEND; timekeeping_suspend(); } else { tick_suspend_local(); @@ -514,6 +515,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); } else { diff --git a/kernel/time/time.c b/kernel/time/time.c index 6fa99213fc72..2b41e8e2d31d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -28,6 +28,7 @@ */ #include <linux/export.h> +#include <linux/kernel.h> #include <linux/timex.h> #include <linux/capability.h> #include <linux/timekeeper_internal.h> @@ -314,9 +315,10 @@ unsigned int jiffies_to_msecs(const unsigned long j) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; + return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> + HZ_TO_MSEC_SHR32; # else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; + return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c4f0f2e4126e..dcc0166d1997 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,22 +12,22 @@ config NOP_TRACER config HAVE_FTRACE_NMI_ENTER bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_FUNCTION_TRACER bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_FUNCTION_GRAPH_TRACER bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_DYNAMIC_FTRACE bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_DYNAMIC_FTRACE_WITH_REGS bool @@ -35,12 +35,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_FTRACE_MCOUNT_RECORD bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_SYSCALL_TRACEPOINTS bool help - See Documentation/trace/ftrace-design.txt + See Documentation/trace/ftrace-design.rst config HAVE_FENTRY bool @@ -110,11 +110,7 @@ config GENERIC_TRACER # config TRACING_SUPPORT bool - # PPC32 has no irqflags tracing support, but it can use most of the - # tracers anyway, they were tested to build and work. Note that new - # exceptions to this list aren't welcomed, better implement the - # irqflags tracing for your architecture. - depends on TRACE_IRQFLAGS_SUPPORT || PPC32 + depends on TRACE_IRQFLAGS_SUPPORT depends on STACKTRACE_SUPPORT default y @@ -452,7 +448,7 @@ config KPROBE_EVENTS help This allows the user to add tracing events (similar to tracepoints) on the fly via the ftrace interface. See - Documentation/trace/kprobetrace.txt for more details. + Documentation/trace/kprobetrace.rst for more details. Those events can be inserted wherever kprobes can probe, and record various register and memory values. @@ -579,7 +575,7 @@ config MMIOTRACE implementation and works via page faults. Tracing is disabled by default and can be enabled at run-time. - See Documentation/trace/mmiotrace.txt. + See Documentation/trace/mmiotrace.rst. If you are not helping to develop drivers, say N. config TRACING_MAP diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 56ba0f2a01db..0ae6829804bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,12 +14,14 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/kprobes.h> +#include <linux/syscalls.h> #include <linux/error-injection.h> #include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); /** * trace_call_bpf - invoke BPF program @@ -474,8 +476,6 @@ BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; - if (unlikely(in_interrupt())) - return -EINVAL; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; @@ -564,6 +564,10 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_prandom_u32_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_str_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; +#endif default: return NULL; } @@ -577,6 +581,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE @@ -664,6 +670,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size, + u64, flags) +{ + struct pt_regs *regs = *(struct pt_regs **)tp_buff; + + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_tp = { + .func = bpf_get_stack_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -672,6 +697,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; default: return tracing_func_proto(func_id, prog); } @@ -734,6 +761,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_tp; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: @@ -744,7 +773,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output and/or bpf_get_stack_id + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack */ static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, @@ -787,6 +816,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); +} + +static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { + .func = bpf_get_stack_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -795,6 +844,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_perf_event_output_proto_raw_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; + case BPF_FUNC_get_stack: + return &bpf_get_stack_proto_raw_tp; default: return tracing_func_proto(func_id, prog); } @@ -833,8 +884,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type return false; if (type != BPF_READ) return false; - if (off % size != 0) - return false; + if (off % size != 0) { + if (sizeof(unsigned long) != 4) + return false; + if (size != 8) + return false; + if (off % size != 4) + return false; + } switch (off) { case bpf_ctx_range(struct bpf_perf_event_data, sample_period): @@ -959,6 +1016,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event) old_array = event->tp_event->prog_array; ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret == -ENOENT) + goto unlock; if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); } else { @@ -1117,3 +1176,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) mutex_unlock(&bpf_event_mutex); return err; } + +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr) +{ + bool is_tracepoint, is_syscall_tp; + struct bpf_prog *prog; + int flags, err = 0; + + prog = event->prog; + if (!prog) + return -ENOENT; + + /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) + return -EOPNOTSUPP; + + *prog_id = prog->aux->id; + flags = event->tp_event->flags; + is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT; + is_syscall_tp = is_syscall_trace_event(event->tp_event); + + if (is_tracepoint || is_syscall_tp) { + *buf = is_tracepoint ? event->tp_event->tp->name + : event->tp_event->name; + *fd_type = BPF_FD_TYPE_TRACEPOINT; + *probe_offset = 0x0; + *probe_addr = 0x0; + } else { + /* kprobe/uprobe */ + err = -EOPNOTSUPP; +#ifdef CONFIG_KPROBE_EVENTS + if (flags & TRACE_EVENT_FL_KPROBE) + err = bpf_get_kprobe_info(event, fd_type, buf, + probe_offset, probe_addr, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif +#ifdef CONFIG_UPROBE_EVENTS + if (flags & TRACE_EVENT_FL_UPROBE) + err = bpf_get_uprobe_info(event, fd_type, buf, + probe_offset, + event->attr.type == PERF_TYPE_TRACEPOINT); +#endif + } + + return err; +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8d83bcf9ef69..efed9c1cfb7e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -728,7 +728,7 @@ static int ftrace_profile_init_cpu(int cpu) */ size = FTRACE_PROFILE_HASH_SIZE; - stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kcalloc(size, sizeof(struct hlist_head), GFP_KERNEL); if (!stat->hash) return -ENOMEM; @@ -6830,9 +6830,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) struct task_struct *g, *t; for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { - ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack_list[i] = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack_list[i]) { start = 0; end = i; @@ -6904,9 +6905,9 @@ static int start_graph_tracing(void) struct ftrace_ret_stack **ret_stack_list; int ret, cpu; - ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * - sizeof(struct ftrace_ret_stack *), - GFP_KERNEL); + ret_stack_list = kmalloc_array(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); if (!ret_stack_list) return -ENOMEM; @@ -7088,9 +7089,10 @@ void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) ret_stack = per_cpu(idle_ret_stack, cpu); if (!ret_stack) { - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack = + kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack) return; per_cpu(idle_ret_stack, cpu) = ret_stack; @@ -7109,9 +7111,9 @@ void ftrace_graph_init_task(struct task_struct *t) if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; - ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH - * sizeof(struct ftrace_ret_stack), - GFP_KERNEL); + ret_stack = kmalloc_array(FTRACE_RETFUNC_DEPTH, + sizeof(struct ftrace_ret_stack), + GFP_KERNEL); if (!ret_stack) return; graph_init_task(t, ret_stack); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c9cb9767d49b..6a46af21765c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -809,7 +809,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); * * You can see, it is legitimate for the previous pointer of * the head (or any page) not to point back to itself. But only - * temporarially. + * temporarily. */ #define RB_PAGE_NORMAL 0UL @@ -906,7 +906,7 @@ static void rb_list_head_clear(struct list_head *list) } /* - * rb_head_page_dactivate - clears head page ptr (for free list) + * rb_head_page_deactivate - clears head page ptr (for free list) */ static void rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) @@ -1780,7 +1780,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, put_online_cpus(); } else { - /* Make sure this CPU has been intitialized */ + /* Make sure this CPU has been initialized */ if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) goto out; @@ -2325,7 +2325,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, /* * If we need to add a timestamp, then we - * add it to the start of the resevered space. + * add it to the start of the reserved space. */ if (unlikely(info->add_timestamp)) { bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer); @@ -2681,7 +2681,7 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) * ring_buffer_nest_start - Allow to trace while nested * @buffer: The ring buffer to modify * - * The ring buffer has a safty mechanism to prevent recursion. + * The ring buffer has a safety mechanism to prevent recursion. * But there may be a case where a trace needs to be done while * tracing something else. In this case, calling this function * will allow this function to nest within a currently active @@ -2699,7 +2699,7 @@ void ring_buffer_nest_start(struct ring_buffer *buffer) preempt_disable_notrace(); cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - /* This is the shift value for the above recusive locking */ + /* This is the shift value for the above recursive locking */ cpu_buffer->nest += NESTED_BITS; } @@ -2718,7 +2718,7 @@ void ring_buffer_nest_end(struct ring_buffer *buffer) /* disabled by ring_buffer_nest_start() */ cpu = raw_smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - /* This is the shift value for the above recusive locking */ + /* This is the shift value for the above recursive locking */ cpu_buffer->nest -= NESTED_BITS; preempt_enable_notrace(); } @@ -2907,7 +2907,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, * @buffer: the ring buffer to reserve from * @length: the length of the data to reserve (excluding event header) * - * Returns a reseverd event on the ring buffer to copy directly to. + * Returns a reserved event on the ring buffer to copy directly to. * The user of this interface will need to get the body to write into * and can use the ring_buffer_event_data() interface. * @@ -3009,7 +3009,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, * This function lets the user discard an event in the ring buffer * and then that event will not be read later. * - * This function only works if it is called before the the item has been + * This function only works if it is called before the item has been * committed. It will try to free the event from the ring buffer * if another event has not been added behind it. * @@ -4127,7 +4127,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * through the buffer. Memory is allocated, buffer recording * is disabled, and the iterator pointer is returned to the caller. * - * Disabling buffer recordng prevents the reading from being + * Disabling buffer recording prevents the reading from being * corrupted. This is not a consuming read, so a producer is not * expected. * diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bcd93031d042..a0079b4c7a49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1360,8 +1360,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf; - if (tr->stop_count) return; @@ -1375,9 +1373,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); - buf = tr->trace_buffer.buffer; - tr->trace_buffer.buffer = tr->max_buffer.buffer; - tr->max_buffer.buffer = buf; + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); @@ -1751,12 +1747,13 @@ static inline void set_cmdline(int idx, const char *cmdline) static int allocate_cmdlines_buffer(unsigned int val, struct saved_cmdlines_buffer *s) { - s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid), - GFP_KERNEL); + s->map_cmdline_to_pid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_pid), + GFP_KERNEL); if (!s->map_cmdline_to_pid) return -ENOMEM; - s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL); + s->saved_cmdlines = kmalloc_array(TASK_COMM_LEN, val, GFP_KERNEL); if (!s->saved_cmdlines) { kfree(s->map_cmdline_to_pid); return -ENOMEM; @@ -4360,7 +4357,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map), + tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; @@ -4395,8 +4393,7 @@ static int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; - int ret = -ENODEV; - int i; + int ret; size_t orig_len = strlen(option); cmp = strstrip(option); @@ -4408,16 +4405,12 @@ static int trace_set_options(struct trace_array *tr, char *option) mutex_lock(&trace_types_lock); - for (i = 0; trace_options[i]; i++) { - if (strcmp(cmp, trace_options[i]) == 0) { - ret = set_tracer_flag(tr, 1 << i, !neg); - break; - } - } - + ret = match_string(trace_options, -1, cmp); /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) + if (ret < 0) ret = set_tracer_option(tr, cmp, neg); + else + ret = set_tracer_flag(tr, 1 << ret, !neg); mutex_unlock(&trace_types_lock); @@ -5068,7 +5061,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, * where the head holds the module and length of array, and the * tail holds a pointer to the next list. */ - map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); + map_array = kmalloc_array(len + 2, sizeof(*map_array), GFP_KERNEL); if (!map_array) { pr_warn("Unable to allocate trace eval mapping\n"); return; @@ -6074,6 +6067,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, { struct trace_array *tr = filp->private_data; struct ring_buffer_event *event; + enum event_trigger_type tt = ETT_NONE; struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; @@ -6122,6 +6116,12 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, written = cnt; len = cnt; + if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { + /* do not add \n before testing triggers, but add \0 */ + entry->buf[cnt] = '\0'; + tt = event_triggers_call(tr->trace_marker_file, entry, event); + } + if (entry->buf[cnt - 1] != '\n') { entry->buf[cnt] = '\n'; entry->buf[cnt + 1] = '\0'; @@ -6130,6 +6130,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); + if (tt) + event_triggers_post_call(tr->trace_marker_file, tt); + if (written > 0) *fpos += written; @@ -7896,6 +7899,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { + struct trace_event_file *file; int cpu; trace_create_file("available_tracers", 0444, d_tracer, @@ -7928,6 +7932,12 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + file = __find_event_file(tr, "ftrace", "print"); + if (file && file->dir) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); + tr->trace_marker_file = file; + trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); @@ -8111,6 +8121,8 @@ static __init int tracer_init_tracefs(void) if (IS_ERR(d_tracer)) return 0; + event_trace_init(); + init_tracer_tracefs(&global_trace, d_tracer); ftrace_init_tracefs_toplevel(&global_trace, d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 507954b4e058..630c5a24b2b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -259,6 +259,7 @@ struct trace_array { struct trace_options *topts; struct list_head systems; struct list_head events; + struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; #ifdef CONFIG_FUNCTION_TRACER @@ -1334,7 +1335,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) - event_triggers_post_call(file, tt, entry, event); + event_triggers_post_call(file, tt); } /** @@ -1367,7 +1368,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file, irq_flags, pc, regs); if (tt) - event_triggers_post_call(file, tt, entry, event); + event_triggers_post_call(file, tt); } #define FILTER_PRED_INVALID ((unsigned short)-1) @@ -1451,9 +1452,13 @@ trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); +extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); +extern struct trace_event_file *__find_event_file(struct trace_array *tr, + const char *system, + const char *event); extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3a658bac10f..1d67464ed95e 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -230,7 +230,7 @@ FTRACE_ENTRY(bprint, bprint_entry, FILTER_OTHER ); -FTRACE_ENTRY(print, print_entry, +FTRACE_ENTRY_REG(print, print_entry, TRACE_PRINT, @@ -242,7 +242,9 @@ FTRACE_ENTRY(print, print_entry, F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), - FILTER_OTHER + FILTER_OTHER, + + ftrace_event_register ); FTRACE_ENTRY(raw_data, raw_data_entry, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 05c7172c6667..14ff4ff3caab 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2007,16 +2007,18 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); /* * Only event directories that can be enabled should have - * triggers. + * triggers or filters. */ - if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { + trace_create_file("filter", 0644, file->dir, file, + &ftrace_event_filter_fops); + trace_create_file("trigger", 0644, file->dir, file, &event_trigger_fops); + } #ifdef CONFIG_HIST_TRIGGERS trace_create_file("hist", 0444, file->dir, file, @@ -2473,8 +2475,9 @@ __trace_add_event_dirs(struct trace_array *tr) } } +/* Returns any file that matches the system and event */ struct trace_event_file * -find_event_file(struct trace_array *tr, const char *system, const char *event) +__find_event_file(struct trace_array *tr, const char *system, const char *event) { struct trace_event_file *file; struct trace_event_call *call; @@ -2485,10 +2488,7 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) call = file->event_call; name = trace_event_name(call); - if (!name || !call->class || !call->class->reg) - continue; - - if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + if (!name || !call->class) continue; if (strcmp(event, name) == 0 && @@ -2498,6 +2498,20 @@ find_event_file(struct trace_array *tr, const char *system, const char *event) return NULL; } +/* Returns valid trace event files that match system and event */ +struct trace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct trace_event_file *file; + + file = __find_event_file(tr, system, event); + if (!file || !file->event_call->class->reg || + file->event_call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + return NULL; + + return file; +} + #ifdef CONFIG_DYNAMIC_FTRACE /* Avoid typos */ @@ -3132,7 +3146,7 @@ static __init int event_trace_enable_again(void) early_initcall(event_trace_enable_again); -static __init int event_trace_init(void) +__init int event_trace_init(void) { struct trace_array *tr; struct dentry *d_tracer; @@ -3177,8 +3191,6 @@ void __init trace_event_init(void) event_trace_enable(); } -fs_initcall(event_trace_init); - #ifdef CONFIG_FTRACE_STARTUP_TEST static DEFINE_SPINLOCK(test_spinlock); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7d306b74230f..0dceb77d1d42 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -78,7 +78,8 @@ static const char * ops[] = { OPS }; C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ - C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FILTER, "No filter found"), #undef C #define C(a, b) FILT_ERR_##a @@ -436,15 +437,15 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, nr_preds += 2; /* For TRUE and FALSE */ - op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL); + op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL); + prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; } - inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL); + inverts = kmalloc_array(nr_preds, sizeof(*inverts), GFP_KERNEL); if (!inverts) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -550,6 +551,13 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, goto out_free; } + if (!N) { + /* No program? */ + ret = -EINVAL; + parse_error(pe, FILT_ERR_NO_FILTER, ptr - str); + goto out_free; + } + prog[N].pred = NULL; /* #13 */ prog[N].target = 1; /* TRUE */ prog[N+1].pred = NULL; @@ -750,31 +758,32 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING + * or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) { - if (strncmp(str, r->pattern, len) == 0) - return 1; - return 0; + /* len of zero means str is dynamic and ends with '\0' */ + if (!len) + return strcmp(str, r->pattern) == 0; + + return strncmp(str, r->pattern, len) == 0; } static int regex_match_front(char *str, struct regex *r, int len) { - if (len < r->len) + if (len && len < r->len) return 0; - if (strncmp(str, r->pattern, r->len) == 0) - return 1; - return 0; + return strncmp(str, r->pattern, r->len) == 0; } static int regex_match_middle(char *str, struct regex *r, int len) { - if (strnstr(str, r->pattern, len)) - return 1; - return 0; + if (!len) + return strstr(str, r->pattern) != NULL; + + return strnstr(str, r->pattern, len) != NULL; } static int regex_match_end(char *str, struct regex *r, int len) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b9061ed59bbd..046c716a6536 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2865,7 +2865,7 @@ static struct trace_event_file *event_file(struct trace_array *tr, { struct trace_event_file *file; - file = find_event_file(tr, system, event_name); + file = __find_event_file(tr, system, event_name); if (!file) return ERR_PTR(-EINVAL); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 8b5bdcf64871..d18249683682 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -97,7 +97,6 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke - * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -108,8 +107,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt, - void *rec, struct ring_buffer_event *event) + enum event_trigger_type tt) { struct event_trigger_data *data; @@ -117,7 +115,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, rec, event); + data->ops->func(data, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 548e62eb5c46..45630a76ed3a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -14,6 +14,13 @@ #include "trace_output.h" +/* Stub function for events with triggers */ +static int ftrace_event_register(struct trace_event_call *call, + enum trace_reg type, void *data) +{ + return 0; +} + #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace @@ -117,7 +124,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __dynamic_array #define __dynamic_array(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ + ret = trace_define_field(event_call, #type "[]", #item, \ offsetof(typeof(field), item), \ 0, is_signed_type(type), filter_type);\ if (ret) \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 02aed76e0978..daa81571b22a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, head, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); + +int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type, + const char **symbol, u64 *probe_offset, + u64 *probe_addr, bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_kprobe *tk; + + if (perf_type_tracepoint) + tk = find_trace_kprobe(pevent, group); + else + tk = event->tp_event->data; + if (!tk) + return -EINVAL; + + *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE + : BPF_FD_TYPE_KPROBE; + if (tk->symbol) { + *symbol = tk->symbol; + *probe_offset = tk->rp.kp.offset; + *probe_addr = 0; + } else { + *symbol = NULL; + *probe_offset = 0; + *probe_addr = (unsigned long)tk->rp.kp.addr; + } + return 0; +} #endif /* CONFIG_PERF_EVENTS */ /* diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ac892878dbe6..bf89a51e740d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, { __uprobe_perf_func(tu, func, regs, ucb, dsize); } + +int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, + const char **filename, u64 *probe_offset, + bool perf_type_tracepoint) +{ + const char *pevent = trace_event_name(event->tp_event); + const char *group = event->tp_event->class->system; + struct trace_uprobe *tu; + + if (perf_type_tracepoint) + tu = find_probe_event(pevent, group); + else + tu = event->tp_event->data; + if (!tu) + return -EINVAL; + + *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE + : BPF_FD_TYPE_UPROBE; + *filename = tu->filename; + *probe_offset = tu->offset; + return 0; +} #endif /* CONFIG_PERF_EVENTS */ static int diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 5cadb1b8b5fe..752d8042bad4 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -1075,7 +1075,7 @@ int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; - entries = vmalloc(map->max_elts * sizeof(sort_entry)); + entries = vmalloc(array_size(sizeof(sort_entry), map->max_elts)); if (!entries) return -ENOMEM; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1e37da2e0c25..6dc6356c3327 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -257,7 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, } /** - * tracepoint_probe_register - Connect a probe to a tracepoint + * tracepoint_probe_register_prio - Connect a probe to a tracepoint with priority * @tp: tracepoint * @probe: probe handler * @data: tracepoint data @@ -290,7 +290,6 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio); * @tp: tracepoint * @probe: probe handler * @data: tracepoint data - * @prio: priority of this function over other registered functions * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for diff --git a/kernel/umh.c b/kernel/umh.c index f76b3ff876cf..c449858946af 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -25,6 +25,8 @@ #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> +#include <linux/shmem_fs.h> +#include <linux/pipe_fs_i.h> #include <trace/events/module.h> @@ -97,9 +99,14 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + sub_info->pid = task_pid_nr(current); + if (sub_info->file) + retval = do_execve_file(sub_info->file, + sub_info->argv, sub_info->envp); + else + retval = do_execve(getname_kernel(sub_info->path), + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); out: sub_info->retval = retval; /* @@ -393,6 +400,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv, } EXPORT_SYMBOL(call_usermodehelper_setup); +struct subprocess_info *call_usermodehelper_setup_file(struct file *file, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), void *data) +{ + struct subprocess_info *sub_info; + + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL); + if (!sub_info) + return NULL; + + INIT_WORK(&sub_info->work, call_usermodehelper_exec_work); + sub_info->path = "none"; + sub_info->file = file; + sub_info->init = init; + sub_info->cleanup = cleanup; + sub_info->data = data; + return sub_info; +} + +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +{ + struct umh_info *umh_info = info->data; + struct file *from_umh[2]; + struct file *to_umh[2]; + int err; + + /* create pipe to send data to umh */ + err = create_pipe_files(to_umh, 0); + if (err) + return err; + err = replace_fd(0, to_umh[0], 0); + fput(to_umh[0]); + if (err < 0) { + fput(to_umh[1]); + return err; + } + + /* create pipe to receive data from umh */ + err = create_pipe_files(from_umh, 0); + if (err) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + return err; + } + err = replace_fd(1, from_umh[1], 0); + fput(from_umh[1]); + if (err < 0) { + fput(to_umh[1]); + replace_fd(0, NULL, 0); + fput(from_umh[0]); + return err; + } + + umh_info->pipe_to_umh = to_umh[1]; + umh_info->pipe_from_umh = from_umh[0]; + return 0; +} + +static void umh_save_pid(struct subprocess_info *info) +{ + struct umh_info *umh_info = info->data; + + umh_info->pid = info->pid; +} + +/** + * fork_usermode_blob - fork a blob of bytes as a usermode process + * @data: a blob of bytes that can be do_execv-ed as a file + * @len: length of the blob + * @info: information about usermode process (shouldn't be NULL) + * + * Returns either negative error or zero which indicates success + * in executing a blob of bytes as a usermode process. In such + * case 'struct umh_info *info' is populated with two pipes + * and a pid of the process. The caller is responsible for health + * check of the user process, killing it via pid, and closing the + * pipes when user process is no longer needed. + */ +int fork_usermode_blob(void *data, size_t len, struct umh_info *info) +{ + struct subprocess_info *sub_info; + struct file *file; + ssize_t written; + loff_t pos = 0; + int err; + + file = shmem_kernel_file_setup("", len, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + written = kernel_write(file, data, len, &pos); + if (written != len) { + err = written; + if (err >= 0) + err = -ENOMEM; + goto out; + } + + err = -ENOMEM; + sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup, + umh_save_pid, info); + if (!sub_info) + goto out; + + err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); +out: + fput(file); + return err; +} +EXPORT_SYMBOL_GPL(fork_usermode_blob); + /** * call_usermodehelper_exec - start a usermode application * @sub_info: information about the subprocessa diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 246d4d4ce5c7..c3d7583fcd21 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -764,8 +764,9 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ - forward = kmalloc(sizeof(struct uid_gid_extent) * - UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL); + forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, + sizeof(struct uid_gid_extent), + GFP_KERNEL); if (!forward) return -ENOMEM; @@ -1235,6 +1236,7 @@ bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } +EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca7959be8aaa..78b192071ef7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,7 +66,7 @@ enum { * be executing on any CPU. The pool behaves as an unbound one. * * Note that DISASSOCIATED should be flipped only while holding - * attach_mutex to avoid changing binding state while + * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ @@ -123,7 +123,7 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * A: pool->attach_mutex protected. + * A: wq_pool_attach_mutex protected. * * PL: wq_pool_mutex protected. * @@ -166,7 +166,6 @@ struct worker_pool { /* L: hash of busy workers */ struct worker *manager; /* L: purely informational */ - struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -297,6 +296,7 @@ static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ +static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ @@ -399,14 +399,14 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @worker: iteration cursor * @pool: worker_pool to iterate workers of * - * This must be called with @pool->attach_mutex. + * This must be called with wq_pool_attach_mutex. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool_worker(worker, pool) \ list_for_each_entry((worker), &(pool)->workers, node) \ - if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \ + if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \ else /** @@ -1724,7 +1724,7 @@ static struct worker *alloc_worker(int node) static void worker_attach_to_pool(struct worker *worker, struct worker_pool *pool) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any @@ -1733,37 +1733,40 @@ static void worker_attach_to_pool(struct worker *worker, set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains - * stable across this function. See the comments above the - * flag definition for details. + * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains + * stable across this function. See the comments above the flag + * definition for details. */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; list_add_tail(&worker->node, &pool->workers); + worker->pool = pool; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /** * worker_detach_from_pool() - detach a worker from its pool * @worker: worker which is attached to its pool - * @pool: the pool @worker is attached to * * Undo the attaching which had been done in worker_attach_to_pool(). The * caller worker shouldn't access to the pool after detached except it has * other reference to the pool. */ -static void worker_detach_from_pool(struct worker *worker, - struct worker_pool *pool) +static void worker_detach_from_pool(struct worker *worker) { + struct worker_pool *pool = worker->pool; struct completion *detach_completion = NULL; - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); + list_del(&worker->node); + worker->pool = NULL; + if (list_empty(&pool->workers)) detach_completion = pool->detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* clear leftover flags without pool->lock after it is detached */ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); @@ -1799,7 +1802,6 @@ static struct worker *create_worker(struct worker_pool *pool) if (!worker) goto fail; - worker->pool = pool; worker->id = id; if (pool->cpu >= 0) @@ -2086,6 +2088,12 @@ __acquires(&pool->lock) worker->current_pwq = pwq; work_color = get_work_color(work); + /* + * Record wq name for cmdline and debug reporting, may get + * overridden through set_worker_desc(). + */ + strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN); + list_del_init(&work->entry); /* @@ -2181,7 +2189,6 @@ __acquires(&pool->lock) worker->current_work = NULL; worker->current_func = NULL; worker->current_pwq = NULL; - worker->desc_valid = false; pwq_dec_nr_in_flight(pwq, work_color); } @@ -2206,6 +2213,16 @@ static void process_scheduled_works(struct worker *worker) } } +static void set_pf_worker(bool val) +{ + mutex_lock(&wq_pool_attach_mutex); + if (val) + current->flags |= PF_WQ_WORKER; + else + current->flags &= ~PF_WQ_WORKER; + mutex_unlock(&wq_pool_attach_mutex); +} + /** * worker_thread - the worker thread function * @__worker: self @@ -2224,7 +2241,7 @@ static int worker_thread(void *__worker) struct worker_pool *pool = worker->pool; /* tell the scheduler that this is a workqueue worker */ - worker->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); woke_up: spin_lock_irq(&pool->lock); @@ -2232,11 +2249,11 @@ woke_up: if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); WARN_ON_ONCE(!list_empty(&worker->entry)); - worker->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); set_task_comm(worker->task, "kworker/dying"); ida_simple_remove(&pool->worker_ida, worker->id); - worker_detach_from_pool(worker, pool); + worker_detach_from_pool(worker); kfree(worker); return 0; } @@ -2335,7 +2352,7 @@ static int rescuer_thread(void *__rescuer) * Mark rescuer as worker too. As WORKER_PREP is never cleared, it * doesn't participate in concurrency management. */ - rescuer->task->flags |= PF_WQ_WORKER; + set_pf_worker(true); repeat: set_current_state(TASK_IDLE); @@ -2367,7 +2384,6 @@ repeat: worker_attach_to_pool(rescuer, pool); spin_lock_irq(&pool->lock); - rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and @@ -2417,10 +2433,9 @@ repeat: if (need_more_worker(pool)) wake_up_worker(pool); - rescuer->pool = NULL; spin_unlock_irq(&pool->lock); - worker_detach_from_pool(rescuer, pool); + worker_detach_from_pool(rescuer); spin_lock_irq(&wq_mayday_lock); } @@ -2429,7 +2444,7 @@ repeat: if (should_stop) { __set_current_state(TASK_RUNNING); - rescuer->task->flags &= ~PF_WQ_WORKER; + set_pf_worker(false); return 0; } @@ -3271,7 +3286,6 @@ static int init_worker_pool(struct worker_pool *pool) timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0); - mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); ida_init(&pool->worker_ida); @@ -3354,10 +3368,10 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (!list_empty(&pool->workers)) pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); if (pool->detach_completion) wait_for_completion(pool->detach_completion); @@ -3700,8 +3714,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]), - GFP_KERNEL); + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); @@ -4347,9 +4360,9 @@ void set_worker_desc(const char *fmt, ...) va_start(args, fmt); vsnprintf(worker->desc, sizeof(worker->desc), fmt, args); va_end(args); - worker->desc_valid = true; } } +EXPORT_SYMBOL_GPL(set_worker_desc); /** * print_worker_info - print out worker information and description @@ -4371,7 +4384,6 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) char desc[WORKER_DESC_LEN] = { }; struct pool_workqueue *pwq = NULL; struct workqueue_struct *wq = NULL; - bool desc_valid = false; struct worker *worker; if (!(task->flags & PF_WQ_WORKER)) @@ -4384,22 +4396,18 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) worker = kthread_probe_data(task); /* - * Carefully copy the associated workqueue's workfn and name. Keep - * the original last '\0' in case the original contains garbage. + * Carefully copy the associated workqueue's workfn, name and desc. + * Keep the original last '\0' in case the original is garbage. */ probe_kernel_read(&fn, &worker->current_func, sizeof(fn)); probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq)); probe_kernel_read(&wq, &pwq->wq, sizeof(wq)); probe_kernel_read(name, wq->name, sizeof(name) - 1); - - /* copy worker description */ - probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid)); - if (desc_valid) - probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); + probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { printk("%sWorkqueue: %s %pf", log_lvl, name, fn); - if (desc[0]) + if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); } @@ -4579,6 +4587,47 @@ void show_workqueue_state(void) rcu_read_unlock_sched(); } +/* used to show worker information through /proc/PID/{comm,stat,status} */ +void wq_worker_comm(char *buf, size_t size, struct task_struct *task) +{ + int off; + + /* always show the actual comm */ + off = strscpy(buf, task->comm, size); + if (off < 0) + return; + + /* stabilize PF_WQ_WORKER and worker pool association */ + mutex_lock(&wq_pool_attach_mutex); + + if (task->flags & PF_WQ_WORKER) { + struct worker *worker = kthread_data(task); + struct worker_pool *pool = worker->pool; + + if (pool) { + spin_lock_irq(&pool->lock); + /* + * ->desc tracks information (wq name or + * set_worker_desc()) for the latest execution. If + * current, prepend '+', otherwise '-'. + */ + if (worker->desc[0] != '\0') { + if (worker->current_work) + scnprintf(buf + off, size - off, "+%s", + worker->desc); + else + scnprintf(buf + off, size - off, "-%s", + worker->desc); + } + spin_unlock_irq(&pool->lock); + } + } + + mutex_unlock(&wq_pool_attach_mutex); +} + +#ifdef CONFIG_SMP + /* * CPU hotplug. * @@ -4600,7 +4649,7 @@ static void unbind_workers(int cpu) struct worker *worker; for_each_cpu_worker_pool(pool, cpu) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); spin_lock_irq(&pool->lock); /* @@ -4616,7 +4665,7 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); /* * Call schedule() so that we cross rq->lock and thus can @@ -4657,7 +4706,7 @@ static void rebind_workers(struct worker_pool *pool) { struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* * Restore CPU affinity of all workers. As all idle workers should @@ -4727,7 +4776,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) static cpumask_t cpumask; struct worker *worker; - lockdep_assert_held(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_attach_mutex); /* is @cpu allowed for @pool? */ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) @@ -4762,14 +4811,14 @@ int workqueue_online_cpu(unsigned int cpu) mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { - mutex_lock(&pool->attach_mutex); + mutex_lock(&wq_pool_attach_mutex); if (pool->cpu == cpu) rebind_workers(pool); else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); + mutex_unlock(&wq_pool_attach_mutex); } /* update NUMA affinity of unbound workqueues */ @@ -4799,8 +4848,6 @@ int workqueue_offline_cpu(unsigned int cpu) return 0; } -#ifdef CONFIG_SMP - struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -5591,7 +5638,7 @@ static void __init wq_numa_init(void) * available. Build one from cpu_to_node() which should have been * fully initialized by now. */ - tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL); + tbl = kcalloc(nr_node_ids, sizeof(tbl[0]), GFP_KERNEL); BUG_ON(!tbl); for_each_node(node) diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index d390d1be3748..66fbb5a9e633 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -31,13 +31,12 @@ struct worker { struct work_struct *current_work; /* L: work being processed */ work_func_t current_func; /* L: current_work's fn */ struct pool_workqueue *current_pwq; /* L: current_work's pwq */ - bool desc_valid; /* ->desc is valid */ struct list_head scheduled; /* L: scheduled works */ /* 64 bytes boundary on 64bit, 32 on 32bit */ struct task_struct *task; /* I: worker task */ - struct worker_pool *pool; /* I: the associated pool */ + struct worker_pool *pool; /* A: the associated pool */ /* L: for rescuers */ struct list_head node; /* A: anchored at pool->workers */ /* A: runs through worker->node */ |